Skip to content

Commit

Permalink
Fixed the headless mode bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
dev-aravind committed Oct 23, 2024
1 parent c0cf984 commit d43fe25
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 3 deletions.
30 changes: 30 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,33 @@
FROM ruby:3.1.2

# Install the shared libraries that Chrome needs to run, plus Xvfb (virtual
# X display), then install google-chrome-stable from Google's apt repository.
#
# The repository signing key is written to a dedicated keyring and referenced
# through `signed-by`, instead of being added with the deprecated `apt-key`,
# so the key is trusted for this one repository only. The repository itself is
# fetched over HTTPS. gnupg is listed explicitly because `gpg --dearmor` is
# required to convert the ASCII-armored key.
#
# The apt package lists are removed at the end of the same layer to keep the
# image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    gnupg \
    apt-transport-https \
    ca-certificates \
    libx11-xcb1 \
    libxcomposite1 \
    libxcursor1 \
    libxdamage1 \
    libxi6 \
    libxtst6 \
    libnss3 \
    libxrandr2 \
    libasound2 \
    libpangocairo-1.0-0 \
    libatk1.0-0 \
    libcups2 \
    libgbm1 \
    libpangoft2-1.0-0 \
    libjpeg-dev \
    libxshmfence1 \
    libgles2-mesa \
    xvfb \
    && curl -fsSL https://dl.google.com/linux/linux_signing_key.pub \
       | gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg \
    && echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] https://dl.google.com/linux/chrome/deb/ stable main" \
       > /etc/apt/sources.list.d/google-chrome.list \
    && apt-get update \
    && apt-get install -y --no-install-recommends google-chrome-stable \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /usr/src/app

COPY Gemfile Gemfile.lock ./
Expand All @@ -8,4 +36,6 @@ RUN bundle install

COPY . .

# Exposes the location of the Chrome binary to the application at runtime;
# presumably this is where the google-chrome-stable package installed above
# places its launcher.
# NOTE(review): Ferrum's README documents the BROWSER_PATH environment variable
# (or the :browser_path option) for selecting a binary -- confirm that
# something actually reads FERRUM_CHROME_PATH, otherwise this line has no effect.
ENV FERRUM_CHROME_PATH=/usr/bin/google-chrome

ENTRYPOINT ["bundle", "exec", "ruby", "src/main.rb"]
2 changes: 1 addition & 1 deletion src/lib/entity_fetcher.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def self.fetch_entity_urls(page_url, entity_identifier, is_paginated)
number_of_entities = entity_urls.length
entities_data.each do |entity|
href = entity["href"]
entity_urls << (href.start_with?('http') ? href : base_url + href)
entity_urls << (href.start_with?('http') ? href : base_url + (href.start_with?('/') ? href : "/#{href}"))
end

break if entity_urls.length == number_of_entities || page_number.nil?
Expand Down
3 changes: 2 additions & 1 deletion src/lib/headless_browser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

module HeadlessBrowser
def self.fetch_json_ld_objects(entity_urls, base_url)
browser = Ferrum::Browser.new(headless: true, pending_connection_errors: false)
puts "Loading browser..."
browser = Ferrum::Browser.new(headless: true, pending_connection_errors: false, process_timeout: 60, xvfb: true, browser_options: { 'no-sandbox': nil })
graph = RDF::Graph.new
add_url_sparql_file = File.read('./sparql/add_derived_from.sparql')
entity_urls.each do |entity_url|
Expand Down
4 changes: 3 additions & 1 deletion src/lib/rdf_processor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ def self.process_rdf(entity_urls, base_url)
begin
puts "Processing #{entity_url} in non-headless mode"
entity_url = entity_url.gsub(' ', '+')
loaded_graph = RDF::Graph.load(entity_url)
linkeddata_version = Gem::Specification.find_by_name('linkeddata').version.to_s
options = { headers: { 'User-Agent' => "artsdata-crawler/#{linkeddata_version}" } }
loaded_graph = RDF::Graph.load(entity_url, **options)
sparql_file_with_url = add_url_sparql_file.gsub("subject_url", entity_url)
loaded_graph.query(SPARQL.parse(sparql_file_with_url, update: true))
graph << loaded_graph
Expand Down

0 comments on commit d43fe25

Please sign in to comment.