Skip to content

Commit

Permalink
Merge branch 'add-xls-analyzer' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
gbp committed Jan 8, 2024
2 parents dfa118a + 324f495 commit 68cc6df
Show file tree
Hide file tree
Showing 13 changed files with 212 additions and 40 deletions.
1 change: 1 addition & 0 deletions config/packages
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ libicu-dev
libmagic-dev
libmagickwand-dev
libpq-dev
libreoffice-calc-nogui
libsqlite3-dev
libxml2-dev
libxslt1-dev
Expand Down
1 change: 1 addition & 0 deletions config/packages.generic
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ libicu-dev
libmagic-dev
libmagickwand-dev
libpq-dev
libreoffice-calc-nogui
libsqlite3-dev
libxml2-dev
libxslt-dev
Expand Down
2 changes: 1 addition & 1 deletion doc/CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* Restore delivery status notification attachment note (Gareth Rees)
* Explore CSV files in a Datasette Lite instance (Gareth Rees)
* Add link from incoming message to admin page for attachments (Gareth Rees)
* Add XSLX spreadsheet analyser to automatically detect hidden data (Helen
* Add XLS & XLSX spreadsheet analyser to automatically detect hidden data (Helen
Cross, Graeme Porteous)
* Update attachment processing to automatically rebuild if cached file goes
missing (Graeme Porteous)
Expand Down
3 changes: 2 additions & 1 deletion gems/excel_analyzer/lib/excel_analyzer.rb
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
require "excel_analyzer/analyzer"
require "excel_analyzer/xls_analyzer"
require "excel_analyzer/xlsx_analyzer"
require "excel_analyzer/railtie" if defined?(Rails)
Original file line number Diff line number Diff line change
@@ -1,37 +1,16 @@
require "active_storage"
require "active_storage/analyzer"
require "nokogiri"
require "zip"

module ExcelAnalyzer
##
# The Analyzer class is responsible for analyzing Excel files uploaded through
# Active Storage. It checks for various features within the Excel file such as
# It checks for various features within the Excel (.xlsx) file such as
# hidden rows, columns, sheets, pivot caches, and external links.
#
# The class uses rubyzip and Nokogiri for reading and parsing the contents of
# the Excel (.xlsx) files.
# The module uses rubyzip and Nokogiri for reading and parsing the contents.
#
class Analyzer < ActiveStorage::Analyzer
XLSX_CONTENT_TYPE =
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

def self.accept?(blob)
blob.content_type == XLSX_CONTENT_TYPE
end

def metadata
{ excel: excel_metadata }
end

private

def excel_metadata
download_blob_to_tempfile(&method(:probe))
end

def probe(tempfile)
Zip::File.open(tempfile.path) do |zip_file|
module Probe
def probe(io)
Zip::File.open(io.path) do |zip_file|
{
pivot_cache: zip_file.glob("xl/pivotCache/*").any?,
external_links: zip_file.glob("xl/externalLinks/*").any?,
Expand All @@ -40,11 +19,10 @@ def probe(tempfile)
hidden_sheets: hidden_sheets?(zip_file)
}
end

rescue StandardError => ex
{ error: ex.message }
end

private

def namespace
{ "ns" => "http://schemas.openxmlformats.org/spreadsheetml/2006/main" }
end
Expand Down
5 changes: 3 additions & 2 deletions gems/excel_analyzer/lib/excel_analyzer/railtie.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
module ExcelAnalyzer
  ##
  # This Railtie integrates the gem with Rails by extending ActiveStorage's
  # Analyzers with the gem's spreadsheet analyzers.
  #
  class Railtie < Rails::Railtie
    # Each analyzer is prepended to the configured list, so the one registered
    # last (XlsAnalyzer) ends up first in the list. The single
    # ExcelAnalyzer::Analyzer class has been split into the two format-specific
    # analyzers below, so it is no longer registered here.
    config.active_storage.analyzers.prepend ExcelAnalyzer::XlsxAnalyzer
    config.active_storage.analyzers.prepend ExcelAnalyzer::XlsAnalyzer
  end
end
67 changes: 67 additions & 0 deletions gems/excel_analyzer/lib/excel_analyzer/xls_analyzer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
require "open3"
require "tempfile"
require "tmpdir"

require "active_storage"
require "active_storage/analyzer"

require "excel_analyzer/probe"

module ExcelAnalyzer
  ##
  # The XlsAnalyzer class is responsible for analyzing legacy Excel (.xls)
  # files uploaded through Active Storage.
  #
  # Files are first converted to XLSX format with LibreOffice (+soffice+) and
  # the converted file is then passed to ExcelAnalyzer::Probe#probe to look for
  # hidden data.
  #
  class XlsAnalyzer < ActiveStorage::Analyzer
    include ExcelAnalyzer::Probe

    CONTENT_TYPE = "application/vnd.ms-excel"

    # Returns true when the blob's content type is the .xls MIME type.
    def self.accept?(blob)
      blob.content_type == CONTENT_TYPE
    end

    # Returns a Hash with a single +:excel+ key. Its value is whatever the
    # probe returned, or <tt>{ error: message }</tt> when downloading or
    # converting the blob raised a StandardError.
    def metadata
      { excel: excel_metadata }
    end

    private

    # Downloads the blob to a tempfile and runs the convert-then-probe
    # pipeline on it. Any StandardError is reported as error metadata rather
    # than propagated to the caller.
    def excel_metadata
      download_blob_to_tempfile(&method(:convert_and_probe))
    rescue StandardError => ex
      { error: ex.message }
    end

    # Converts the downloaded .xls file to XLSX and probes the result while
    # the converted file still exists on disk.
    def convert_and_probe(io)
      convert(io) { |xlsx| probe(xlsx) }
    end

    # Converts the file at +io.path+ to XLSX inside a temporary directory and
    # yields the converted file (opened read-only, binary mode) to the block.
    # Returns the block's value.
    #
    # The temporary directory, the converted file and its file handle are all
    # cleaned up when the block returns, so nothing outlives the call.
    #
    # Raises RuntimeError when soffice is not installed or the conversion did
    # not produce an output file.
    def convert(io)
      raise 'LibreOffice (soffice) command not found' unless soffice_installed?

      Dir.mktmpdir do |tmpdir|
        # Pass an argument vector rather than an interpolated command string:
        # no shell is involved, so paths containing spaces or shell
        # metacharacters are handled safely.
        _stdout, _stderr, status = Open3.capture3(
          "soffice", "--headless", "--convert-to", "xlsx",
          "--outdir", tmpdir, io.path
        )

        # soffice names the output after the input's basename.
        path = File.join(tmpdir, "#{File.basename(io.path, '.*')}.xlsx")

        unless status.success? && File.exist?(path)
          raise "LibreOffice conversion failed"
        end

        File.open(path, "rb") { |xlsx| yield xlsx }
      end
    end

    # Truthy when a +soffice+ executable can be found on the PATH.
    def soffice_installed?
      system("which soffice > /dev/null 2>&1")
    end
  end
end
33 changes: 33 additions & 0 deletions gems/excel_analyzer/lib/excel_analyzer/xlsx_analyzer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
require "active_storage"
require "active_storage/analyzer"

require "excel_analyzer/probe"

module ExcelAnalyzer
  ##
  # Active Storage analyzer for Excel (.xlsx) blobs. The downloaded file is
  # handed to ExcelAnalyzer::Probe#probe and the result is exposed under the
  # +:excel+ metadata key.
  #
  class XlsxAnalyzer < ActiveStorage::Analyzer
    include ExcelAnalyzer::Probe

    CONTENT_TYPE =
      "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

    class << self
      # True when the blob's content type is the .xlsx MIME type.
      def accept?(blob)
        blob.content_type == CONTENT_TYPE
      end
    end

    # Returns a Hash with a single +:excel+ key holding the probe result, or
    # <tt>{ error: message }</tt> when a StandardError was raised.
    def metadata
      { excel: excel_metadata }
    end

    private

    # Downloads the blob to a tempfile and probes it; any StandardError is
    # turned into error metadata instead of being raised.
    def excel_metadata
      download_blob_to_tempfile { |file| probe(file) }
    rescue StandardError => error
      { error: error.message }
    end
  end
end
89 changes: 89 additions & 0 deletions gems/excel_analyzer/spec/excel_analyzer/xls_analyzer_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# frozen_string_literal: true

require "spec_helper"

RSpec.describe ExcelAnalyzer::XlsAnalyzer do
  describe ".accept?" do
    subject { described_class.accept?(blob) }

    context "when the blob is an Excel file" do
      let(:blob) { fake_blob(content_type: described_class::CONTENT_TYPE) }

      it { is_expected.to eq(true) }
    end

    context "when the blob is not an Excel file" do
      let(:blob) { fake_blob(content_type: "text/plain") }

      it { is_expected.to eq(false) }
    end
  end

  describe "#metadata" do
    let(:metadata) { described_class.new(blob).metadata }

    context "when the blob is an Excel file with hidden data" do
      let(:blob) do
        fake_blob(io: fixture("suspect.xls"),
                  content_type: described_class::CONTENT_TYPE)
      end

      # One example per probe flag; the label completes the sentence
      # "detects ...".
      {
        pivot_cache: "pivot cache",
        external_links: "external links",
        hidden_rows: "hidden rows",
        hidden_columns: "hidden columns",
        hidden_sheets: "hidden sheets"
      }.each do |flag, label|
        it "detects #{label}" do
          expect(metadata[:excel][flag]).to eq(true)
        end
      end
    end

    context "when the blob is an Excel file without hidden data" do
      let(:blob) do
        fake_blob(io: fixture("data.xls"),
                  content_type: described_class::CONTENT_TYPE)
      end

      it "does not detect hidden data" do
        expected = {
          pivot_cache: false,
          external_links: false,
          hidden_rows: false,
          hidden_columns: false,
          hidden_sheets: false
        }

        expect(metadata[:excel]).to eq(expected)
      end
    end

    context "when the blob is not an Excel file" do
      let(:blob) do
        fake_blob(io: fixture("plain.txt"), content_type: "text/plain")
      end

      it "returns an error metadata" do
        expect(metadata[:excel]).to eq(error: "LibreOffice conversion failed")
      end
    end
  end

  private

  # Opens a file from the spec fixtures directory.
  def fixture(name)
    File.open(File.join(__dir__, "../fixtures/#{name}"))
  end

  # Builds a stand-in for an Active Storage blob: it reports the given content
  # type and yields +io+ when #open is called.
  def fake_blob(io: nil, content_type:)
    double(content_type: content_type).tap do |blob|
      allow(blob).to receive(:open).and_yield(io)
    end
  end
end
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@

require "spec_helper"

RSpec.describe ExcelAnalyzer::Analyzer do
RSpec.describe ExcelAnalyzer::XlsxAnalyzer do
describe ".accept?" do
subject { ExcelAnalyzer::Analyzer.accept?(blob) }
subject { ExcelAnalyzer::XlsxAnalyzer.accept?(blob) }

context "when the blob is an Excel file" do
let(:blob) do
fake_blob(content_type: ExcelAnalyzer::Analyzer::XLSX_CONTENT_TYPE)
fake_blob(content_type: ExcelAnalyzer::XlsxAnalyzer::CONTENT_TYPE)
end

it { is_expected.to eq true }
Expand All @@ -21,12 +21,12 @@
end

describe "#metadata" do
let(:metadata) { ExcelAnalyzer::Analyzer.new(blob).metadata }
let(:metadata) { ExcelAnalyzer::XlsxAnalyzer.new(blob).metadata }

context "when the blob is an Excel file with hidden data" do
let(:blob) do
fake_blob(io: File.open(File.join(__dir__, "../fixtures/suspect.xlsx")),
content_type: ExcelAnalyzer::Analyzer::XLSX_CONTENT_TYPE)
content_type: ExcelAnalyzer::XlsxAnalyzer::CONTENT_TYPE)
end

it "detects pivot cache" do
Expand All @@ -53,7 +53,7 @@
context "when the blob is an Excel file without hidden data" do
let(:blob) do
fake_blob(io: File.open(File.join(__dir__, "../fixtures/data.xlsx")),
content_type: ExcelAnalyzer::Analyzer::XLSX_CONTENT_TYPE)
content_type: ExcelAnalyzer::XlsxAnalyzer::CONTENT_TYPE)
end

it "does not detect hidden data" do
Expand Down
Binary file added gems/excel_analyzer/spec/fixtures/data.xls
Binary file not shown.
Binary file added gems/excel_analyzer/spec/fixtures/suspect.xls
Binary file not shown.
3 changes: 2 additions & 1 deletion gems/excel_analyzer/spec/spec_helper.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# frozen_string_literal: true

require "bundler/setup"
require "excel_analyzer/analyzer"
require "excel_analyzer/xls_analyzer"
require "excel_analyzer/xlsx_analyzer"

RSpec.configure do |config|
# Enable flags like --only-failures and --next-failure
Expand Down

0 comments on commit 68cc6df

Please sign in to comment.