Skip to content

Commit

Permalink
Merge pull request #220 from pelias/download_adapter
Browse files Browse the repository at this point in the history
abstract TIGER download source using adapter pattern
  • Loading branch information
missinglink authored Sep 30, 2019
2 parents 44d200b + 42352e5 commit 8bf2d3d
Show file tree
Hide file tree
Showing 7 changed files with 280 additions and 31 deletions.
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
"dependencies": {
"@mapbox/polyline": "^1.0.0",
"async": "^3.1.0",
"cheerio": "^1.0.0-rc.3",
"cli-table3": "^0.5.0",
"csv-parse": "^4.4.6",
"express": "^4.14.0",
Expand All @@ -42,6 +43,7 @@
"serve-index": "^1.8.0",
"split2": "^3.0.0",
"sqlite3": "^4.0.0",
"superagent": "^5.1.0",
"through2": "^3.0.0",
"through2-batch": "^1.0.1",
"unzip": "^0.1.11"
Expand Down
25 changes: 25 additions & 0 deletions script/js/adapter/CensusFTP.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
const JSFtp = require('jsftp');

/**
 * Download adapter backed by the Census FTP server.
 *
 * Remote paths passed to the FTP client are built by joining `this.prefix`
 * with the caller-supplied pattern / file name.
 */
class CensusFTP {
  constructor() {
    this.prefix = '/geo/tiger/TIGER2016/ADDRFEAT';
    this.client = new JSFtp({ host: 'ftp2.census.gov' });
  }

  /**
   * List the remote files matching `pattern`.
   *
   * @param {string} pattern - pattern appended to the remote prefix and handed to the FTP `list` command
   * @param {function} cb - called with `(err)` on failure or `(null, fileNames)` on success
   */
  list(pattern, cb) {
    const remotePattern = `${this.prefix}/${pattern}`;

    this.client.list(remotePattern, (err, res) => {
      if (err) { return cb(err); }

      // The listing is handled like unix `ls` output: one entry per line with
      // the file name at the end. Only the trailing 27 characters of each line
      // are kept (the names are treated as fixed length), then whitespace is
      // trimmed and lines which end up empty are dropped.
      const files = [];
      for (const line of res.split('\n')) {
        const name = line.slice(-27).trim();
        if (name.length > 0) { files.push(name); }
      }

      cb(null, files);
    });
  }

  /**
   * Download a single remote file.
   *
   * @param {string} remoteFileName - file name relative to the remote prefix
   * @param {string} localFilePath - destination path handed to the FTP client
   * @param {function} cb - passed straight through to the FTP client's `get`
   */
  get(remoteFileName, localFilePath, cb) {
    const remotePath = `${this.prefix}/${remoteFileName}`;
    this.client.get(remotePath, localFilePath, cb);
  }
}

module.exports = CensusFTP;
54 changes: 54 additions & 0 deletions script/js/adapter/CensusS3Mirror.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
const fs = require('fs');
const path = require('path');
const request = require('superagent');
const cheerio = require('cheerio');
const conform = /^tl_2016_(\d{5})_addrfeat\.zip$/;

/**
 * Download adapter backed by an HTTP mirror of the Census ADDRFEAT files.
 *
 * `list` scrapes the links out of the mirror's `index.html`, `get` streams a
 * single file to disk.
 */
class CensusS3Mirror {
  constructor() {
    this.host = 'https://census-backup.s3.amazonaws.com';
    this.prefix = '/tiger/2016/ADDRFEAT';
  }

  /**
   * List the remote file names matching a glob-style pattern.
   *
   * Only `*` (any run of characters) and `?` (any single character) are
   * treated as wildcards; every other character is matched literally.
   *
   * @param {string} pattern - glob-style file name pattern
   * @param {function} cb - called with `(err)` on failure or `(null, fileNames)` on success
   */
  list(pattern, cb) {

    // convert glob-style pattern to regex
    // regex metacharacters are escaped first, so that characters such as '.'
    // or '+' match literally and '[' or '(' cannot make `new RegExp` throw
    const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, '\\$&');
    const regex = new RegExp(`^${escaped.replace(/\*/g, '.*').replace(/\?/g, '.')}$`);

    request
      .get(`${this.host}${this.prefix}/index.html`)
      .set('accept', 'text/html')
      .end((err, res) => {
        if (err) { return cb(err); }
        if (res.status >= 400) { return cb(new Error(`status code: ${res.status}`)); }

        // parse HTML and collect the href of every anchor
        const $ = cheerio.load(res.text);
        const hrefs = $('a').map((i, el) => $(el).attr('href')).get();

        const files = hrefs
          // remove path prefixes
          .map((href) => path.basename(href))
          // keep conforming file names only (removes any other links on the page)
          .filter((name) => conform.test(name))
          // apply pattern filter
          .filter((name) => regex.test(name));

        cb(null, files);
      });
  }

  /**
   * Download a remote file to a local path.
   *
   * @param {string} remoteFileName - file name relative to the remote prefix
   * @param {string} localFilePath - path of the local file to write
   * @param {function} cb - called exactly once: with no arguments on success,
   *   or with an Error when the request fails, the local file cannot be
   *   written, or the server responds with a status >= 400
   */
  get(remoteFileName, localFilePath, cb) {
    const sink = fs.createWriteStream(localFilePath);

    // several events below can signal completion; report the first one only
    let settled = false;
    const done = (err) => {
      if (settled) { return; }
      settled = true;
      if (err) { return cb(err); }
      cb();
    };

    // an HTTP error status is recorded and reported once the sink has finished,
    // so the response body is still drained and the file handle is closed
    // before the callback fires
    let statusError = null;

    sink.on('finish', () => done(statusError));
    // without this handler an unwritable path would raise an unhandled 'error' event
    sink.on('error', done);

    // download remote file to local file path
    request
      .get(`${this.host}${this.prefix}/${remoteFileName}`)
      // NOTE(review): relies on superagent emitting 'response' for piped
      // requests; if it does not, this handler is simply never invoked
      .on('response', (res) => {
        if (res.status >= 400) {
          statusError = new Error(`status code: ${res.status}`);
        }
      })
      .on('error', (error) => {
        // no more data will arrive; release the file handle
        sink.destroy();
        done(error);
      })
      .pipe(sink);
  }
}

module.exports = CensusS3Mirror;
67 changes: 37 additions & 30 deletions script/js/update_tiger.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
const JSFtp = require('jsftp');
const async = require('async');
const path = require('path');
const fs = require('fs-extra');
Expand All @@ -7,6 +6,8 @@ const logger = require('pelias-logger').get('interpolation(TIGER)');
const config = require('pelias-config').generate();
const _ = require('lodash');

const CensusS3Mirror = require('./adapter/CensusS3Mirror');
const adapter = new CensusS3Mirror();

let TARGET_DIR = _.get(config, 'imports.interpolation.download.tiger.datapath', './data/downloads');
let STATES = _.get(config, 'imports.interpolation.download.tiger.states', []);
Expand Down Expand Up @@ -40,9 +41,6 @@ async.eachSeries(STATES, download, (err)=>{

function download(state, callback) {
const context = {
ftp: new JSFtp({
host: 'ftp2.census.gov'
}),
stateCode: state.hasOwnProperty('state_code') ? parseInt(state.state_code, 10) : '',
countyCode: state.hasOwnProperty('county_code') ? parseInt(state.county_code, 10) : '',
files: []
Expand Down Expand Up @@ -77,17 +75,10 @@ function getFilteredFileList(context, callback) {
return callback();
}
}
context.ftp.list(`/geo/tiger/TIGER2016/ADDRFEAT/tl_2016_${filter}*.zip`, (err, res) => {
if (err) {
return callback(err);
}

// output of the list command looks like a typical ls command in unix
// this line will split the output into lines, and from each line grab the end of the file
// (all filenames are fixed length 27 chars)
// then it will trim the names and filter out any empty ones
context.files = res.split('\n').map((file)=>(file.substr(-27).trim())).filter((file)=>(file.length > 0));

adapter.list(`tl_2016_${filter}*.zip`, (err, files) => {
if (err) { return callback(err); }
logger.info(`Queuing ${files.length} downloads`);
context.files = files;
callback();
});
}
Expand All @@ -96,33 +87,49 @@ function downloadFilteredFiles(context, callback) {
context.downloadsDir = path.join(TARGET_DIR, 'downloads');
context.shapefilesDir = path.join(TARGET_DIR, 'shapefiles');

// ensure directories exist
fs.ensureDirSync(context.downloadsDir);
fs.ensureDirSync(context.shapefilesDir);

// ensure directories are writable
fs.accessSync(context.downloadsDir, fs.constants.R_OK | fs.constants.W_OK);
fs.accessSync(context.shapefilesDir, fs.constants.R_OK | fs.constants.W_OK);

// must use eachSeries here because the ftp connection only allows one download at a time
async.eachSeries(context.files, downloadFile.bind(null, context), callback);
}

function downloadFile(context, filename, callback) {
const localFile = path.join(context.downloadsDir, filename);

context.ftp.get(`/geo/tiger/TIGER2016/ADDRFEAT/${filename}`, localFile, (err)=> {
if (err) {
return callback(err);
}
adapter.get(filename, localFile, (err) => {
logger.info(`Downloading ${filename}`);
if (err) { return callback(err); }
logger.debug(`Downloaded ${filename}`);

logger.info(`Downloaded ${filename}`);
// record unzip errors
let unzipError = null;

// unzip downloaded file
fs.createReadStream(localFile).pipe(unzip.Extract({ path: context.shapefilesDir })).on('finish', (err) => {
if (err) {
logger.error(`Failed to unzip ${filename}`);
return callback(err);
}

// delete zip file after unzip is done
fs.unlinkSync(localFile);
callback();
// decompress files to shapefile directory
const decompress = unzip.Extract({ path: context.shapefilesDir });
decompress.on('error', (err) => {
unzipError = err;
logger.error(`Failed to unzip ${filename}`);
logger.error(err);
});

// unzip downloaded file
logger.info(`Decompressing ${filename}`);
fs.createReadStream(localFile)
.pipe(decompress)
.on('finish', () => {
logger.debug(`Decompressed ${filename}`);

// delete zip file after unzip is done
fs.unlinkSync(localFile);

// return unzip error if one occurred
callback(unzipError);
});
});
}
4 changes: 3 additions & 1 deletion test/_unit.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ var tests = [
require('./stream/split.js'),
require('./stream/oa/convert.js'),
require('./stream/osm/convert.js'),
require('./stream/osm/delimited_ranges.js')
require('./stream/osm/delimited_ranges.js'),
// require('./script/js/adapter/CensusFTP'),
require('./script/js/adapter/CensusS3Mirror')
];

tests.map(function(t) {
Expand Down
83 changes: 83 additions & 0 deletions test/script/js/adapter/CensusFTP.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
const fs = require('fs');
const os = require('os');
const path = require('path');
const crypto = require('crypto');
const JSFtp = require('jsftp');
const CensusFTP = require('../../../../script/js/adapter/CensusFTP');

module.exports.tests = {};

// Checks the shape of a freshly constructed CensusFTP adapter.
module.exports.tests.constructor = (test) => {
  test('constructor', (t) => {
    const { client, prefix } = new CensusFTP();

    t.true(client instanceof JSFtp);
    t.equal(typeof prefix, 'string');

    // end the client's socket before finishing the test
    client.socket.end();
    t.end();
  });
};

// Exercises CensusFTP#list with patterns of decreasing scope.
// NOTE(review): no stubbing is visible in this file, so these cases presumably
// talk to the real FTP server and the expected counts depend on its contents.
module.exports.tests.list = function (test) {
  const conform = /^tl_2016_(\d{5})_addrfeat\.zip$/;

  // [test name, pattern passed to list(), expected number of files]
  const cases = [
    ['list - all', 'tl_2016_*_addrfeat.zip', 3220],
    ['list - whole state', 'tl_2016_72*_addrfeat.zip', 78],
    ['list - subset of state', 'tl_2016_7200*_addrfeat.zip', 5],
    ['list - single file', 'tl_2016_72001_addrfeat.zip', 1]
  ];

  cases.forEach(([name, pattern, expected]) => {
    test(name, function (t) {
      const adapter = new CensusFTP();
      adapter.list(pattern, (err, files) => {
        // end the socket first so it is released even when the listing failed
        adapter.client.socket.end();

        // report a failed listing as a test failure instead of throwing a
        // TypeError on `files.length`
        t.error(err);
        if (!err) {
          t.equal(files.length, expected);
          t.true(files.every(f => conform.test(f)));
        }
        t.end();
      });
    });
  });
};

// Downloads one file through CensusFTP#get into a randomly named temp file and
// checks the size of the result.
module.exports.tests.get = function (test) {
  test('get - single file', function (t) {
    const adapter = new CensusFTP();
    const tmpFile = path.join(os.tmpdir(), crypto.randomBytes(16).toString('hex'));
    adapter.get('tl_2016_72149_addrfeat.zip', tmpFile, (err) => {
      // end the socket first so it is released even when the download failed
      adapter.client.socket.end();

      // report a failed download as a test failure instead of letting
      // statSync throw on a missing file
      t.error(err);
      if (!err) {
        t.equal(fs.statSync(tmpFile).size, 42950);
      }

      // clean up (the file may not exist when the download failed)
      if (fs.existsSync(tmpFile)) { fs.unlinkSync(tmpFile); }
      t.end();
    });
  });
};

// Registers every test group from `module.exports.tests` with the supplied
// test function, prefixing each test name with the adapter name.
module.exports.all = function (tape) {
  const test = (name, testFunction) => tape('CensusFTP: ' + name, testFunction);

  const groups = module.exports.tests;
  for (const groupName in groups) {
    groups[groupName](test);
  }
};
76 changes: 76 additions & 0 deletions test/script/js/adapter/CensusS3Mirror.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
const os = require('os');
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
const CensusS3Mirror = require('../../../../script/js/adapter/CensusS3Mirror');

module.exports.tests = {};

// Checks the shape of a freshly constructed CensusS3Mirror adapter.
module.exports.tests.constructor = (test) => {
  test('constructor', (t) => {
    const { host, prefix } = new CensusS3Mirror();

    t.equal(typeof host, 'string');
    t.equal(typeof prefix, 'string');
    t.end();
  });
};

// Exercises CensusS3Mirror#list with patterns of decreasing scope.
// NOTE(review): no stubbing is visible in this file, so these cases presumably
// hit the real mirror and the expected counts depend on its contents.
module.exports.tests.list = function (test) {
  const conform = /^tl_2016_(\d{5})_addrfeat\.zip$/;

  // [test name, pattern passed to list(), expected number of files]
  const cases = [
    ['list - all', 'tl_2016_*_addrfeat.zip', 3220],
    ['list - whole state', 'tl_2016_72*_addrfeat.zip', 78],
    ['list - subset of state', 'tl_2016_7200*_addrfeat.zip', 5],
    ['list - single file', 'tl_2016_72001_addrfeat.zip', 1]
  ];

  cases.forEach(([name, pattern, expected]) => {
    test(name, function (t) {
      const adapter = new CensusS3Mirror();
      adapter.list(pattern, (err, files) => {
        // report a failed listing as a test failure instead of throwing a
        // TypeError on `files.length`
        t.error(err);
        if (!err) {
          t.equal(files.length, expected);
          t.true(files.every(f => conform.test(f)));
        }
        t.end();
      });
    });
  });
};

// Downloads one file through CensusS3Mirror#get into a randomly named temp
// file and checks the size of the result.
module.exports.tests.get = function (test) {
  test('get - single file', function (t) {
    const adapter = new CensusS3Mirror();
    const tmpFile = path.join(os.tmpdir(), crypto.randomBytes(16).toString('hex'));
    adapter.get('tl_2016_72149_addrfeat.zip', tmpFile, (err) => {
      // report a failed download as a test failure instead of letting
      // statSync throw on a missing file
      t.error(err);
      if (!err) {
        t.equal(fs.statSync(tmpFile).size, 42950);
      }

      // clean up (the file may not exist when the download failed)
      if (fs.existsSync(tmpFile)) { fs.unlinkSync(tmpFile); }
      t.end();
    });
  });
};

// Registers every test group from `module.exports.tests` with the supplied
// test function, prefixing each test name with the adapter name.
module.exports.all = function (tape) {
  const test = (name, testFunction) => tape('CensusS3Mirror: ' + name, testFunction);

  const groups = module.exports.tests;
  for (const groupName in groups) {
    groups[groupName](test);
  }
};

0 comments on commit 8bf2d3d

Please sign in to comment.