Skip to content

Commit

Permalink
WIP: adding WOF download/extract
Browse files Browse the repository at this point in the history
  • Loading branch information
ewlarson committed Nov 21, 2024
1 parent c02fd44 commit f72b489
Show file tree
Hide file tree
Showing 7 changed files with 241 additions and 320 deletions.
7 changes: 2 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,5 @@ node_modules
# Vite uses dotenv and suggests to ignore local-only env files. See
# https://vitejs.dev/guide/env-and-mode.html#env-files
*.local
db/geonames/allCountries.txt
db/geonames/allCountries.zip
db/geonames/geonames_export.csv
db/geonames/US.txt
db/geonames/US.zip
db/gazetteer/geonames/*
db/gazetteer/wof/*
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,4 @@ gem "aws-sdk-s3", "~> 1.14"
# Progress bar
gem "activerecord-import"
gem 'ruby-progressbar', '~> 1.13'
gem 'bzip2-ffi'
3 changes: 3 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,8 @@ GEM
popper_js (>= 1.16.1, < 2)
builder (3.3.0)
byebug (11.1.3)
bzip2-ffi (1.1.1)
ffi (~> 1.0)
capybara (3.40.0)
addressable
matrix
Expand Down Expand Up @@ -864,6 +866,7 @@ DEPENDENCIES
bootsnap (~> 1.9.3)
bootstrap (~> 4.0)
byebug
bzip2-ffi
capybara
capybara-screenshot
capybara-selenium
Expand Down
141 changes: 0 additions & 141 deletions db/geonames/readme.txt

This file was deleted.

177 changes: 177 additions & 0 deletions lib/tasks/geoportal/gazetteer/geonames.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
require 'csv'
require 'fileutils'
require 'open-uri'
require 'rsolr'
require 'zip'

# Geoportal Gazetteer Geonames Tasks
# Order of execution:
# 1. download
# 2. import
# 3. export
# 4. reindex_solr
namespace :geoportal do
namespace :gazetteer do
namespace :geonames do
# Downloads the GeoNames US dump archive into db/gazetteer/geonames and
# unpacks every entry of the archive into that same directory.
# On a download error a message is printed and the task ends early.
desc "Download and extract US.zip from Geonames"
task download: :environment do
  url = 'https://download.geonames.org/export/dump/US.zip'
  zip_path = Rails.root.join('db', 'gazetteer', 'geonames', 'US.zip')
  extract_path = Rails.root.join('db', 'gazetteer', 'geonames')

  # Ensure the directory exists
  FileUtils.mkdir_p(extract_path)

  # Download the zip file
  begin
    puts "Downloading file from #{url}..."
    URI.open(url) do |remote_file|
      File.open(zip_path, 'wb') do |file|
        # Copy in chunks rather than loading the whole archive into a String.
        IO.copy_stream(remote_file, file)
      end
    end
  rescue OpenURI::HTTPError => e
    puts "Failed to download file: #{e.message}"
    # `next` (not `return`) leaves a task block; `return` here would raise
    # LocalJumpError because the block is not inside a method.
    next
  rescue Errno::ENOENT => e
    puts "Error accessing URL: #{e.message}"
    next
  end

  # Extract the zip file
  Zip::File.open(zip_path) do |zip_file|
    zip_file.each do |entry|
      # NOTE(review): the block returning true presumably tells rubyzip to
      # overwrite an already-extracted file - confirm against the rubyzip
      # version pinned in Gemfile.lock.
      entry.extract(File.join(extract_path, entry.name)) { true }
    end
  end

  puts "Download and extraction completed successfully."
end

# Loads db/gazetteer/geonames/US.txt into the Geoname model in batches.
# If the file is missing the download task is invoked first; if it is still
# missing afterwards the task prints a message and ends.
# Each line is handled independently: a failure on one line is reported and
# the loop moves on to the next line.
desc "Import US.txt data into the Geonames table"
task import: :environment do
  file_path = Rails.root.join('db', 'gazetteer', 'geonames', 'US.txt')

  # Check if the file exists
  unless File.exist?(file_path)
    puts "File not found. Downloading..."
    Rake::Task['geoportal:gazetteer:geonames:download'].invoke
  end

  # The download task only prints on failure, so re-check before reading.
  unless File.exist?(file_path)
    puts "File not found after download attempt: #{file_path}"
    next
  end

  # Column order of a row in the dump; position i maps to columns[i].
  columns = %i[
    geonameid name asciiname alternatenames latitude longitude
    feature_class feature_code country_code cc2
    admin1_code admin2_code admin3_code admin4_code
    population elevation dem timezone modification_date
  ]
  batch_size = 100_000

  # Geonames Array
  geonames = []

  # Count the total number of lines in the file (pure Ruby, no `wc` needed)
  total_lines = File.foreach(file_path).count

  # Initialize the progress bar
  progress_bar = ProgressBar.create(total: total_lines, format: "%a %b\u{15E7}%i %p%% %t")

  File.foreach(file_path) do |text|
    begin
      # The dump is plain tab-delimited text with no quoting, so split on
      # tabs directly: a CSV parser treats a literal `"` in a place name as
      # a quote character and rejects the row. Empty fields become nil,
      # which is what the CSV parser produced for them.
      values = text.chomp.split("\t", -1).map { |value| value.empty? ? nil : value }
      geonames << columns.zip(values).to_h

      # Import every 100000 records
      if geonames.size >= batch_size
        Geoname.import(geonames, validate: false)
        geonames.clear
      end

      # Increment the progress bar
      progress_bar.increment
    rescue StandardError => e
      puts "Error processing line: #{e.message}"
    end
  end

  # Import any remaining records
  Geoname.import(geonames, validate: false) unless geonames.empty?

  puts "Geonames import completed successfully."
end

# Dumps the geonames table to db/gazetteer/geonames/geonames_export.csv with
# a single PostgreSQL COPY statement. Columns are aliased with suffixes
# (_i, _s, _f, _dts, _p) and a combined "latitude,longitude" column named
# location_p is appended. Any error is reported on stdout.
# NOTE(review): COPY ... TO '<file>' is executed by the database server, so
# the path must be writable from the server's side - confirm for deployments
# where the database is not on the same host as the app.
desc "Export Geoname table to a CSV file using PostgreSQL COPY"
task export: :environment do
  file_path = Rails.root.join('db', 'gazetteer', 'geonames', 'geonames_export.csv')

  # Execute the COPY command
  begin
    connection = ActiveRecord::Base.connection
    # Let the adapter produce the SQL string literal so that a path
    # containing an apostrophe cannot break (or alter) the statement.
    quoted_path = connection.quote(file_path.to_s)

    connection.execute <<~SQL
      COPY (
        SELECT
          geonameid AS geonameid_i,
          name,
          asciiname AS asciiname_s,
          alternatenames AS alternatenames_s,
          latitude AS latitude_f,
          longitude AS longitude_f,
          feature_class AS feature_class_s,
          feature_code AS feature_code_s,
          country_code AS country_code_s,
          cc2 AS cc2_s,
          admin1_code AS admin1_code_s,
          admin2_code AS admin2_code_s,
          admin3_code AS admin3_code_s,
          admin4_code AS admin4_code_s,
          population AS population_i,
          elevation AS elevation_i,
          dem AS dem_i,
          timezone AS timezone_s,
          modification_date AS modification_date_dts,
          latitude || ',' || longitude AS location_p
        FROM geonames
      )
      TO #{quoted_path} WITH CSV HEADER;
    SQL

    puts "Geoname table exported to #{file_path} successfully using PostgreSQL COPY."
  rescue StandardError => e
    puts "Error exporting Geoname table: #{e.message}"
  end
end

# Posts db/gazetteer/geonames/geonames_export.csv to the local Solr
# "geonames" core's update handler (with commit=true) by running curl.
# Prints a success message only when curl exits with status 0; otherwise
# prints what went wrong.
desc "Import Geoname entries into Solr"
task reindex_solr: :environment do
  # Define the path to the CSV file
  csv_file_path = Rails.root.join('db', 'gazetteer', 'geonames', 'geonames_export.csv')

  # Define the Solr update URL
  solr_url = "http://localhost:8983/solr/geonames/update?commit=true"

  puts "Updating Solr with data from #{csv_file_path}..."

  # Argument-list form: no shell is involved, so spaces or quotes in the
  # path cannot split or alter the command.
  succeeded = system(
    'curl', solr_url,
    '--data-binary', "@#{csv_file_path}",
    '-H', 'Content-type:application/csv'
  )

  # system does not raise on failure: it returns true for exit status 0,
  # false for a non-zero status, and nil when the command could not be run.
  # NOTE(review): curl exits 0 on HTTP error responses unless --fail is
  # passed, so a Solr-side rejection is still reported as success here.
  if succeeded
    puts "Geonames import to Solr completed successfully."
  elsif succeeded.nil?
    puts "Error updating Solr: could not execute curl"
  else
    puts "Error updating Solr: curl exited with status #{Process.last_status&.exitstatus}"
  end
end
end
end
end
Loading

0 comments on commit f72b489

Please sign in to comment.