Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor header processing #262

Merged
merged 3 commits into from
Dec 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 37 additions & 29 deletions lib/smarter_csv/headers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,14 @@ def process_headers(filehandle, options)
# the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
header_line = @raw_header = readline_with_counts(filehandle, options)
header_line = preprocess_header_line(header_line, options)
file_header_array, file_header_size = parse_and_modify_headers(header_line, options)

file_header_array, file_header_size = parse(header_line, options)

# header transformations:
file_header_array = transform_headers(file_header_array, options)

# NOTE: this is currently also called on user_provided headers, but it should not be
file_header_array = legacy_header_transformations(file_header_array, options)
else
unless options[:user_provided_headers]
raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers"
Expand All @@ -36,21 +43,19 @@ def process_headers(filehandle, options)
# we could print out the mapping of file_header_array to header_array here
end
end

header_array = user_header_array

# these 3 steps should only be part of the header transformation when headers_in_file:
# -> breaking change when we move this to transform_headers()
# see details in legacy_header_transformations()
#
header_array = legacy_header_transformations(header_array, options)
else
header_array = file_header_array
end

# detect duplicate headers and disambiguate
header_array = disambiguate_headers(header_array, options) if options[:duplicate_header_suffix]

# symbolize headers
header_array.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]

# wouldn't make sense to re-map user provided headers
header_array = remap_headers(header_array, options) if options[:key_mapping] && !options[:user_provided_headers]

validate_and_deprecate_headers(header_array, options)
validate_headers(header_array, options)

[header_array, header_array.size]
end
Expand All @@ -65,17 +70,29 @@ def preprocess_header_line(header_line, options)
header_line
end

def parse_and_modify_headers(header_line, options)
file_header_array, file_header_size = parse(header_line, options)

file_header_array.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
file_header_array.map!{|x| x.strip} if options[:strip_whitespace]
# Transform the headers that were read from the file.
#
# Steps, applied to every header in order:
#   1. remove every occurrence of options[:quote_char]
#   2. strip surrounding whitespace, if options[:strip_whitespace] is set
#   3. unless options[:keep_original_headers] is set:
#        - replace each run of whitespace or dashes with a single underscore
#        - downcase, if options[:downcase_header] is set
#
# header_array is modified in place (as before) and also returned.
def transform_headers(header_array, options)
  # Match the quote character literally: a String pattern passed to gsub is
  # not interpreted as a regular expression, so characters such as '.' or '|'
  # cannot accidentally act as regex metacharacters.
  quote_char = options[:quote_char].to_s

  header_array.map! do |header|
    header = header.gsub(quote_char, '')
    header = header.strip if options[:strip_whitespace]

    unless options[:keep_original_headers]
      header = header.gsub(/\s+|-+/, '_')
      header = header.downcase if options[:downcase_header]
    end

    header
  end

  header_array
end

# Post-processing steps that are currently applied to headers regardless of
# where they came from. Returns the resulting header array; the array passed
# in is not modified by the symbolize step.
def legacy_header_transformations(header_array, options)
  # Detect duplicate headers and disambiguate them.
  # -> user_provided_headers should not contain duplicates in the first place
  if options[:duplicate_header_suffix]
    header_array = disambiguate_headers(header_array, options)
  end

  # Symbolize the headers, unless the caller asked for string keys or for the
  # original headers to be kept.
  # -> user_provided_headers should already be symbols or strings as needed
  leave_as_strings = options[:strings_as_keys] || options[:keep_original_headers]
  header_array = header_array.map(&:to_sym) unless leave_as_strings

  # Re-mapping makes no sense when the headers were provided by the user.
  if options[:key_mapping] && !options[:user_provided_headers]
    header_array = remap_headers(header_array, options)
  end

  header_array
end

def disambiguate_headers(headers, options)
Expand Down Expand Up @@ -117,7 +134,7 @@ def remap_headers(headers, options)
end

# header_validations
def validate_and_deprecate_headers(headers, options)
def validate_headers(headers, options)
duplicate_headers = []
headers.compact.each do |k|
duplicate_headers << k if headers.select{|x| x == k}.size > 1
Expand All @@ -127,15 +144,6 @@ def validate_and_deprecate_headers(headers, options)
raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
end

# deprecate required_headers
unless options[:required_headers].nil?
puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
if options[:required_keys].nil?
options[:required_keys] = options[:required_headers]
options[:required_headers] = nil
end
end

if options[:required_keys] && options[:required_keys].is_a?(Array)
missing_keys = []
options[:required_keys].each do |k|
Expand Down
9 changes: 9 additions & 0 deletions lib/smarter_csv/options_processing.rb
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,15 @@ def default_options
private

def validate_options!(options)
# deprecate required_headers
unless options[:required_headers].nil?
puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
if options[:required_keys].nil?
options[:required_keys] = options[:required_headers]
options[:required_headers] = nil
end
end

keys = options.keys
errors = []
errors << "invalid row_sep" if keys.include?(:row_sep) && !option_valid?(options[:row_sep])
Expand Down
13 changes: 7 additions & 6 deletions lib/smarter_csv/smarter_csv.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,24 @@ class KeyMappingError < SmarterCSVException; end

# first parameter: filename or input object which responds to readline method
def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
options = process_options(given_options)

initialize_variables

options = process_options(given_options)

has_rails = !!defined?(Rails)

begin
fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
end

# auto-detect the row separator
options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
# attempt to auto-detect column separator
options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
end

skip_lines(fh, options)

@headers, header_size = process_headers(fh, options)
Expand Down