From f72b4891bc9af481fc55d20dd91497303ffa2919 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Thu, 21 Nov 2024 13:24:24 -0600 Subject: [PATCH] WIP: adding WOF download/extract --- .gitignore | 7 +- Gemfile | 1 + Gemfile.lock | 3 + db/geonames/readme.txt | 141 ---------------- lib/tasks/geoportal/gazetteer/geonames.rake | 177 ++++++++++++++++++++ lib/tasks/geoportal/gazetteer/wof.rake | 58 +++++++ lib/tasks/geoportal/geoportal_geonames.rake | 174 ------------------- 7 files changed, 241 insertions(+), 320 deletions(-) delete mode 100644 db/geonames/readme.txt create mode 100644 lib/tasks/geoportal/gazetteer/geonames.rake create mode 100644 lib/tasks/geoportal/gazetteer/wof.rake delete mode 100644 lib/tasks/geoportal/geoportal_geonames.rake diff --git a/.gitignore b/.gitignore index a1e67b74f..d67a35215 100644 --- a/.gitignore +++ b/.gitignore @@ -61,8 +61,5 @@ node_modules # Vite uses dotenv and suggests to ignore local-only env files. See # https://vitejs.dev/guide/env-and-mode.html#env-files *.local -db/geonames/allCountries.txt -db/geonames/allCountries.zip -db/geonames/geonames_export.csv -db/geonames/US.txt -db/geonames/US.zip +db/gazetteer/geonames/* +db/gazetteer/wof/* \ No newline at end of file diff --git a/Gemfile b/Gemfile index 97ef0a539..3911c7b0b 100644 --- a/Gemfile +++ b/Gemfile @@ -162,3 +162,4 @@ gem "aws-sdk-s3", "~> 1.14" # Progress bar gem "activerecord-import" gem 'ruby-progressbar', '~> 1.13' +gem 'bzip2-ffi' \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock index 2ba614dd6..f3b258cce 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -235,6 +235,8 @@ GEM popper_js (>= 1.16.1, < 2) builder (3.3.0) byebug (11.1.3) + bzip2-ffi (1.1.1) + ffi (~> 1.0) capybara (3.40.0) addressable matrix @@ -864,6 +866,7 @@ DEPENDENCIES bootsnap (~> 1.9.3) bootstrap (~> 4.0) byebug + bzip2-ffi capybara capybara-screenshot capybara-selenium diff --git a/db/geonames/readme.txt b/db/geonames/readme.txt deleted file mode 100644 index 0440fdb4c..000000000 --- a/db/geonames/readme.txt +++ /dev/null @@ -1,141 +0,0 @@ - -Readme for GeoNames Gazetteer extract files - -============================================================================================================ - -This work is licensed under a Creative Commons Attribution 4.0 License, -see https://creativecommons.org/licenses/by/4.0/ -The Data is provided "as is" without warranty or any representation of accuracy, timeliness or completeness. - -The data format is tab-delimited text in utf8 encoding. - - -Files : -------- -XX.zip : features for country with iso code XX, see 'geoname' table for columns. 'no-country' for features not belonging to a country. -allCountries.zip : all countries combined in one file, see 'geoname' table for columns -cities500.zip : all cities with a population > 500 or seats of adm div down to PPLA4 (ca 185.000), see 'geoname' table for columns -cities1000.zip : all cities with a population > 1000 or seats of adm div down to PPLA3 (ca 130.000), see 'geoname' table for columns -cities5000.zip : all cities with a population > 5000 or PPLA (ca 50.000), see 'geoname' table for columns -cities15000.zip : all cities with a population > 15000 or capitals (ca 25.000), see 'geoname' table for columns -alternateNamesV2.zip : alternate names with language codes and geonameId, file with iso language codes, with new columns from and to -alternateNames.zip : obsolete use V2, this file does not have the new columns to and from and will be removed in the future -admin1CodesASCII.txt : names in English for admin divisions. Columns: code, name, name ascii, geonameid -admin2Codes.txt : names for administrative subdivision 'admin2 code' (UTF8), Format : concatenated codes name asciiname geonameId -iso-languagecodes.txt : iso 639 language codes, as used for alternate names in file alternateNames.zip -featureCodes.txt : name and description for feature classes and feature codes -timeZones.txt : countryCode, timezoneId, gmt offset on 1st of January, dst offset to gmt on 1st of July (of the current year), rawOffset without DST -countryInfo.txt : country information : iso codes, fips codes, languages, capital ,... - see the geonames webservices for additional country information, - bounding box : http://api.geonames.org/countryInfo? - country names in different languages : http:/api.geonames.org/countryInfoCSV?lang=it -modifications-.txt : all records modified on the previous day, the date is in yyyy-MM-dd format. You can use this file to daily synchronize your own geonames database. -deletes-.txt : all records deleted on the previous day, format : geonameId name comment. - -alternateNamesModifications-.txt : all alternate names modified on the previous day, -alternateNamesDeletes-.txt : all alternate names deleted on the previous day, format : alternateNameId geonameId name comment. -userTags.zip : user tags , format : geonameId tag. -hierarchy.zip : parentId, childId, type. The type 'ADM' stands for the admin hierarchy modeled by the admin1-4 codes. The other entries are entered with the user interface. The relation toponym-adm hierarchy is not included in the file, it can instead be built from the admincodes of the toponym. -adminCode5.zip : the new adm5 column is not yet exported in the other files (in order to not break import scripts). Instead it is availabe as separate file. - columns: geonameId,adm5code - -The main 'geoname' table has the following fields : ---------------------------------------------------- -geonameid : integer id of record in geonames database -name : name of geographical point (utf8) varchar(200) -asciiname : name of geographical point in plain ascii characters, varchar(200) -alternatenames : alternatenames, comma separated, ascii names automatically transliterated, convenience attribute from alternatename table, varchar(10000) -latitude : latitude in decimal degrees (wgs84) -longitude : longitude in decimal degrees (wgs84) -feature class : see http://www.geonames.org/export/codes.html, char(1) -feature code : see http://www.geonames.org/export/codes.html, varchar(10) -country code : ISO-3166 2-letter country code, 2 characters -cc2 : alternate country codes, comma separated, ISO-3166 2-letter country code, 200 characters -admin1 code : fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20) -admin2 code : code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80) -admin3 code : code for third level administrative division, varchar(20) -admin4 code : code for fourth level administrative division, varchar(20) -population : bigint (8 byte int) -elevation : in meters, integer -dem : digital elevation model, srtm3 or gtopo30, average elevation of 3''x3'' (ca 90mx90m) or 30''x30'' (ca 900mx900m) area in meters, integer. srtm processed by cgiar/ciat. -timezone : the iana timezone id (see file timeZone.txt) varchar(40) -modification date : date of last modification in yyyy-MM-dd format - - -AdminCodes: -Most adm1 are FIPS codes. ISO codes are used for US, CH, BE and ME. UK and Greece are using an additional level between country and fips code. The code '00' stands for general features where no specific adm1 code is defined. -The corresponding admin feature is found with the same countrycode and adminX codes and the respective feature code ADMx. - - - -The table 'alternate names' : ------------------------------ -alternateNameId : the id of this alternate name, int -geonameid : geonameId referring to id in table 'geoname', int -isolanguage : iso 639 language code 2- or 3-characters, optionally followed by a hyphen and a countrycode for country specific variants (ex:zh-CN) or by a variant name (ex: zh-Hant); 4-characters 'post' for postal codes and 'iata','icao' and faac for airport codes, fr_1793 for French Revolution names, abbr for abbreviation, link to a website (mostly to wikipedia), wkdt for the wikidataid, varchar(7) -alternate name : alternate name or name variant, varchar(400) -isPreferredName : '1', if this alternate name is an official/preferred name -isShortName : '1', if this is a short name like 'California' for 'State of California' -isColloquial : '1', if this alternate name is a colloquial or slang term. Example: 'Big Apple' for 'New York'. -isHistoric : '1', if this alternate name is historic and was used in the past. Example 'Bombay' for 'Mumbai'. -from : from period when the name was used -to : to period when the name was used - -Remark : the field 'alternatenames' in the table 'geoname' is a short version of the 'alternatenames' table without links and postal codes but with ascii transliterations. You probably don't need both. -If you don't need to know the language of a name variant, the field 'alternatenames' will be sufficient. If you need to know the language -of a name variant, then you will need to load the table 'alternatenames' and you can drop the column in the geoname table. - - - - -Boundaries: -Simplified country boundaries are available in two slightly different formats: -shapes_simplified_low: -geonameId: The geonameId of the feature -geoJson: The boundary in geoJson format - -shapes_simplified_low.json: -similar to the abovementioned file, but fully in geojson format. The geonameId is a feature property in the geojson string. - - -Statistics on the number of features per country and the feature class and code distributions : http://www.geonames.org/statistics/ - - -Continent codes : -AF : Africa geonameId=6255146 -AS : Asia geonameId=6255147 -EU : Europe geonameId=6255148 -NA : North America geonameId=6255149 -OC : Oceania geonameId=6255151 -SA : South America geonameId=6255150 -AN : Antarctica geonameId=6255152 - - -feature classes: -A: country, state, region,... -H: stream, lake, ... -L: parks,area, ... -P: city, village,... -R: road, railroad -S: spot, building, farm -T: mountain,hill,rock,... -U: undersea -V: forest,heath,... - - -If you find errors or miss important places, please do use the wiki-style edit interface on our website -https://www.geonames.org to correct inaccuracies and to add new records. -Thanks in the name of the geonames community for your valuable contribution. - -Data Sources: -https://www.geonames.org/datasources/ - - -More Information is also available in the geonames faq : - -https://forum.geonames.org/gforum/forums/show/6.page - -The forum : https://forum.geonames.org - -or the google group : https://groups.google.com/group/geonames - diff --git a/lib/tasks/geoportal/gazetteer/geonames.rake b/lib/tasks/geoportal/gazetteer/geonames.rake new file mode 100644 index 000000000..cf9322b15 --- /dev/null +++ b/lib/tasks/geoportal/gazetteer/geonames.rake @@ -0,0 +1,177 @@ +require 'csv' +require 'fileutils' +require 'open-uri' +require 'rsolr' +require 'zip' + +# Geoportal Gazetteer Geonames Tasks +# Order of execution: +# 1. download +# 2. import +# 3. export +# 4. reindex_solr +namespace :geoportal do + namespace :gazetteer do + namespace :geonames do + desc "Download and extract allCountries.zip from Geonames" + task download: :environment do + url = 'https://download.geonames.org/export/dump/US.zip' + zip_path = Rails.root.join('db', 'gazetteer', 'geonames', 'US.zip') + extract_path = Rails.root.join('db', 'gazetteer', 'geonames') + + # Ensure the directory exists + FileUtils.mkdir_p(extract_path) + + # Download the zip file + begin + puts "Downloading file from #{url}..." + URI.open(url) do |remote_file| + File.open(zip_path, 'wb') do |file| + file.write(remote_file.read) + end + end + rescue OpenURI::HTTPError => e + puts "Failed to download file: #{e.message}" + return + rescue Errno::ENOENT => e + puts "Error accessing URL: #{e.message}" + return + end + + # Extract the zip file + Zip::File.open(zip_path) do |zip_file| + zip_file.each do |entry| + entry.extract(File.join(extract_path, entry.name)) { true } + end + end + + puts "Download and extraction completed successfully." + end + + desc "Import US.txt data into the Geonames table" + task import: :environment do + file_path = Rails.root.join('db', 'gazetteer', 'geonames', 'US.txt') + + # Check if the file exists + unless File.exist?(file_path) + puts "File not found. Downloading..." + Rake::Task['geoportal:gazetteer:geonames:download'].invoke + end + + # Geonames Array + geonames = [] + + # Count the total number of lines in the file + total_lines = `wc -l "#{file_path}"`.strip.split(' ')[0].to_i + + # Initialize the progress bar + progress_bar = ProgressBar.create(total: total_lines, format: "%a %b\u{15E7}%i %p%% %t") + + File.open(file_path, 'r') do |f| + until f.eof? + begin + text = f.readline + row = CSV.parse_line(text, col_sep: "\t", headers: false) + geonames << { + geonameid: row[0], + name: row[1], + asciiname: row[2], + alternatenames: row[3], + latitude: row[4], + longitude: row[5], + feature_class: row[6], + feature_code: row[7], + country_code: row[8], + cc2: row[9], + admin1_code: row[10], + admin2_code: row[11], + admin3_code: row[12], + admin4_code: row[13], + population: row[14], + elevation: row[15], + dem: row[16], + timezone: row[17], + modification_date: row[18] + } + + # Import every 100000 records + if geonames.size >= 100000 + Geoname.import(geonames, validate: false) + geonames.clear + end + + # Increment the progress bar + progress_bar.increment + rescue StandardError => e + puts "Error processing line: #{e.message}" + end + end + end + + # Import any remaining records + Geoname.import(geonames, validate: false) unless geonames.empty? + + puts "Geonames import completed successfully." + end + + desc "Export Geoname table to a CSV file using PostgreSQL COPY" + task export: :environment do + file_path = Rails.root.join('db', 'gazetteer', 'geonames', 'geonames_export.csv') + + # Execute the COPY command + begin + connection = ActiveRecord::Base.connection + connection.execute <<-SQL + COPY ( + SELECT + geonameid AS geonameid_i, + name, + asciiname AS asciiname_s, + alternatenames AS alternatenames_s, + latitude AS latitude_f, + longitude AS longitude_f, + feature_class AS feature_class_s, + feature_code AS feature_code_s, + country_code AS country_code_s, + cc2 AS cc2_s, + admin1_code AS admin1_code_s, + admin2_code AS admin2_code_s, + admin3_code AS admin3_code_s, + admin4_code AS admin4_code_s, + population AS population_i, + elevation AS elevation_i, + dem AS dem_i, + timezone AS timezone_s, + modification_date AS modification_date_dts, + latitude || ',' || longitude AS location_p + FROM geonames + ) + TO '#{file_path}' WITH CSV HEADER; + SQL + + puts "Geoname table exported to #{file_path} successfully using PostgreSQL COPY." + rescue StandardError => e + puts "Error exporting Geoname table: #{e.message}" + end + end + + desc "Import Geoname entries into Solr" + task reindex_solr: :environment do + # Define the path to the CSV file + csv_file_path = Rails.root.join('db', 'gazetteer', 'geonames', 'geonames_export.csv') + + # Define the Solr update URL + solr_url = "http://localhost:8983/solr/geonames/update?commit=true" + + # Execute the curl command to update Solr + begin + puts "Updating Solr with data from #{csv_file_path}..." + system("curl '#{solr_url}' --data-binary @#{csv_file_path} -H 'Content-type:application/csv'") + puts "Geonames import to Solr completed successfully." + rescue StandardError => e + puts "Error updating Solr: #{e.message}" + end + end + end + end +end diff --git a/lib/tasks/geoportal/gazetteer/wof.rake b/lib/tasks/geoportal/gazetteer/wof.rake new file mode 100644 index 000000000..074a2f8a0 --- /dev/null +++ b/lib/tasks/geoportal/gazetteer/wof.rake @@ -0,0 +1,58 @@ +require 'bzip2/ffi' +require 'csv' +require 'fileutils' +require 'open-uri' +require 'rsolr' + +# Geoportal Gazetteer Who's on First Tasks +# Order of execution: +# 1. download +# 2. import (todo) +namespace :geoportal do + namespace :gazetteer do + namespace :wof do + desc "Download Who's on First sqlite3 database" + task download: :environment do + url = 'https://data.geocode.earth/wof/dist/bundles/whosonfirst-data-admin-us-latest.tar.bz2' + zip_path = Rails.root.join('db', 'gazetteer', 'wof', 'whosonfirst-data-admin-us-latest.tar.bz2') + extract_path = Rails.root.join('db', 'gazetteer', 'wof') + + # Ensure the directory exists + FileUtils.mkdir_p(extract_path) + + # Download the zip file + begin + puts "Downloading file from #{url}..." + URI.open(url) do |remote_file| + File.open(zip_path, 'wb') do |file| + file.write(remote_file.read) + end + end + rescue OpenURI::HTTPError => e + puts "Failed to download file: #{e.message}" + return + rescue Errno::ENOENT => e + puts "Error accessing URL: #{e.message}" + return + end + + # Extract the bz2 file + Bzip2::FFI::Reader.open(zip_path) do |input| + File.open(File.join(extract_path, 'whosonfirst-data-admin-us-latest.tar'), 'wb') do |output| + IO.copy_stream(input, output) + end + end + + # Optionally, extract the tar file if needed + system("tar -xvf #{File.join(extract_path, 'whosonfirst-data-admin-us-latest.tar')} -C #{extract_path}") + + puts "Download and extraction completed successfully." + end + + desc "Import Who's on First data into Rails" + task import: :environment do + # todo + end + end + end +end \ No newline at end of file diff --git a/lib/tasks/geoportal/geoportal_geonames.rake b/lib/tasks/geoportal/geoportal_geonames.rake deleted file mode 100644 index ea2400289..000000000 --- a/lib/tasks/geoportal/geoportal_geonames.rake +++ /dev/null @@ -1,174 +0,0 @@ -require 'csv' -require 'open-uri' -require 'zip' -require 'fileutils' -require 'rsolr' - -namespace :geoportal do - namespace :geonames do - desc "Download and extract allCountries.zip from Geonames" - task download: :environment do - # Takes a day to process - # url = 'https://download.geonames.org/export/dump/allCountries.zip' - # zip_path = Rails.root.join('db', 'geonames', 'allCountries.zip') - - # Takes - url = 'https://download.geonames.org/export/dump/US.zip' - zip_path = Rails.root.join('db', 'geonames', 'US.zip') - extract_path = Rails.root.join('db', 'geonames') - - # Ensure the directory exists - FileUtils.mkdir_p(extract_path) - - # Download the zip file - begin - puts "Downloading file from #{url}..." - URI.open(url) do |remote_file| - File.open(zip_path, 'wb') do |file| - file.write(remote_file.read) - end - end - rescue OpenURI::HTTPError => e - puts "Failed to download file: #{e.message}" - return - rescue Errno::ENOENT => e - puts "Error accessing URL: #{e.message}" - return - end - - # Extract the zip file - Zip::File.open(zip_path) do |zip_file| - zip_file.each do |entry| - entry.extract(File.join(extract_path, entry.name)) { true } - end - end - - puts "Download and extraction completed successfully." - end - - desc "Import US.txt data into the Geonames table" - task import: :environment do - file_path = Rails.root.join('db', 'geonames', 'US.txt') - - # Check if the file exists - unless File.exist?(file_path) - puts "File not found. Downloading..." - Rake::Task['geoportal:geonames:download'].invoke - end - - # Geonames Array - geonames = [] - - # Count the total number of lines in the file - total_lines = `wc -l "#{file_path}"`.strip.split(' ')[0].to_i - - # Initialize the progress bar - progress_bar = ProgressBar.create(total: total_lines, format: "%a %b\u{15E7}%i %p%% %t") - - File.open(file_path, 'r') do |f| - until f.eof? - begin - text = f.readline - row = CSV.parse_line(text, col_sep: "\t", headers: false) - geonames << { - geonameid: row[0], - name: row[1], - asciiname: row[2], - alternatenames: row[3], - latitude: row[4], - longitude: row[5], - feature_class: row[6], - feature_code: row[7], - country_code: row[8], - cc2: row[9], - admin1_code: row[10], - admin2_code: row[11], - admin3_code: row[12], - admin4_code: row[13], - population: row[14], - elevation: row[15], - dem: row[16], - timezone: row[17], - modification_date: row[18] - } - - # Import every 100000 records - if geonames.size >= 100000 - Geoname.import(geonames, validate: false) - geonames.clear - end - - # Increment the progress bar - progress_bar.increment - rescue StandardError => e - puts "Error processing line: #{e.message}" - end - end - end - - # Import any remaining records - Geoname.import(geonames, validate: false) unless geonames.empty? - - puts "Geonames import completed successfully." - end - - desc "Export Geoname table to a CSV file using PostgreSQL COPY" - task export: :environment do - file_path = Rails.root.join('db', 'geonames', 'geonames_export.csv') - - # Execute the COPY command - begin - connection = ActiveRecord::Base.connection - connection.execute <<-SQL - COPY ( - SELECT - geonameid AS geonameid_i, - name, - asciiname AS asciiname_s, - alternatenames AS alternatenames_s, - latitude AS latitude_f, - longitude AS longitude_f, - feature_class AS feature_class_s, - feature_code AS feature_code_s, - country_code AS country_code_s, - cc2 AS cc2_s, - admin1_code AS admin1_code_s, - admin2_code AS admin2_code_s, - admin3_code AS admin3_code_s, - admin4_code AS admin4_code_s, - population AS population_i, - elevation AS elevation_i, - dem AS dem_i, - timezone AS timezone_s, - modification_date AS modification_date_dts, - latitude || ',' || longitude AS location_p - FROM geonames - ) - TO '#{file_path}' WITH CSV HEADER; - SQL - - puts "Geoname table exported to #{file_path} successfully using PostgreSQL COPY." - rescue StandardError => e - puts "Error exporting Geoname table: #{e.message}" - end - end - - desc "Import Geoname entries into Solr" - task reindex_solr: :environment do - # Define the path to the CSV file - csv_file_path = Rails.root.join('db', 'geonames', 'geonames_export.csv') - - # Define the Solr update URL - solr_url = "http://localhost:8983/solr/geonames/update?commit=true" - - # Execute the curl command to update Solr - begin - puts "Updating Solr with data from #{csv_file_path}..." - system("curl '#{solr_url}' --data-binary @#{csv_file_path} -H 'Content-type:application/csv'") - puts "Geonames import to Solr completed successfully." - rescue StandardError => e - puts "Error updating Solr: #{e.message}" - end - end - end -end