diff --git a/lib/zorki/scrapers/scraper.rb b/lib/zorki/scrapers/scraper.rb
index d415312..6063c6a 100644
--- a/lib/zorki/scrapers/scraper.rb
+++ b/lib/zorki/scrapers/scraper.rb
@@ -73,6 +73,36 @@ def get_content_of_subpage_from_url(url, subpage_search)
       Oj.load(response_body)
     end
 
+    # Instagram uses GraphQL (like most of Facebook, I think) and returns an object that is
+    # actually used to seed the page. We can just parse this for most things.
+    #
+    # @return [Hash] a ruby hash of the JSON data
+    # @raise [RuntimeError] if the script or the JSON inside it cannot be found
+    def find_graphql_script
+      scripts = all("script", visible: false)
+      # We used to search for the quoted key '"graphql"'; now we look for a key from the
+      # seed data instead.
+      graphql_script = scripts.find { |s| s.text(:all).include?("followed_by_viewer") }
+      raise "Could not find the preloaded GraphQL script on the page" if graphql_script.nil?
+
+      graphql_text = graphql_script.text(:all)
+
+      # Clean up the javascript so we have pure JSON. Some javascript is tossed in around it,
+      # so the JSON runs from the third `{` through the third-from-last `}`, both included.
+      # `index`/`rindex` return nil once they run out of braces; that nil is carried through
+      # so we can raise a readable error instead of a NoMethodError.
+      json_start = 3.times.reduce(-1) { |pos, _| pos && graphql_text.index("{", pos + 1) }
+      json_end = 3.times.reduce(graphql_text.length) do |pos, _|
+        # rindex treats a negative offset as "from the end", so stop once we reach index 0
+        pos&.positive? ? graphql_text.rindex("}", pos - 1) : nil
+      end
+      if json_start.nil? || json_end.nil? || json_end < json_start
+        raise "Could not locate the JSON payload in the GraphQL script"
+      end
+
+      Oj.load(graphql_text[json_start..json_end])
+    end
+
     private
 
     def login
diff --git a/lib/zorki/scrapers/user_scraper.rb b/lib/zorki/scrapers/user_scraper.rb
index ac3bfba..92c5791 100644
--- a/lib/zorki/scrapers/user_scraper.rb
+++ b/lib/zorki/scrapers/user_scraper.rb
@@ -18,7 +18,13 @@ def parse(username)
       # - *Profile image
       login
 
-      graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "?username=")
+      url = "https://instagram.com/#{username}/"
+      if check_if_content_preloaded?(url)
+        graphql_script = find_graphql_script
+      else
+        graphql_script = get_content_of_subpage_from_url(url, "?username=")
+      end
+
       user = graphql_script["data"]["user"]
 
       # Get the username (to verify we're on the right page here)
@@ -39,5 +45,17 @@ def parse(username)
         profile_image_url: profile_image_url
       }
     end
+
+    private
+
+    # Visits `url` and reports whether the page HTML already contains the seed data,
+    # detected by the presence of the "followed_by_viewer" key.
+    #
+    # @param url [String] the profile page to visit
+    # @return [Boolean] true when the page HTML includes "followed_by_viewer"
+    def check_if_content_preloaded?(url)
+      visit(url)
+      page.html.include?("followed_by_viewer")
+    end
   end
 end