From 5d55b7c16c1575bd8612efa69d6a276044311b7a Mon Sep 17 00:00:00 2001 From: Noutbuk Date: Wed, 3 Apr 2024 22:10:54 +0200 Subject: [PATCH] add custom --- .dockerignore | 6 ++++- .gitignore | 4 ++-- docker-compose.yaml | 2 +- main.py | 4 ---- src/scrape_heading_task.py | 47 +++++++++++++++++++++++++++++--------- 5 files changed, 44 insertions(+), 19 deletions(-) delete mode 100644 main.py diff --git a/.dockerignore b/.dockerignore index 16d1f04..3db445f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -27,4 +27,8 @@ build/ /error_logs/ -index.http \ No newline at end of file +index.http + +test.html +test.json +test.js diff --git a/.gitignore b/.gitignore index 16d1f04..08a17c9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ - +test.* __pycache__/ *.py[cod] @@ -27,4 +27,4 @@ build/ /error_logs/ -index.http \ No newline at end of file +index.http diff --git a/docker-compose.yaml b/docker-compose.yaml index 9e71ddb..f2ae39c 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -12,7 +12,7 @@ services: - ./profiles:/app/profiles - ./profiles.json:/app/profiles.json - ./local_storage.json:/app/local_storage.json - - ./db.sqlite3:/app/db.sqlite3 + # - ./db.sqlite3:/app/db.sqlite3 ports: - "3000:3000" - "8000:8000" diff --git a/main.py b/main.py deleted file mode 100644 index dbe49a3..0000000 --- a/main.py +++ /dev/null @@ -1,4 +0,0 @@ -from src.scrape_heading_task import scrape_heading_task - -if __name__ == "__main__": - scrape_heading_task({"link":"https://www.omkar.cloud/"}) \ No newline at end of file diff --git a/src/scrape_heading_task.py b/src/scrape_heading_task.py index f9f3c99..6f9a738 100644 --- a/src/scrape_heading_task.py +++ b/src/scrape_heading_task.py @@ -1,16 +1,41 @@ +import time from botasaurus import * +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC -@request -def scrape_heading_task(request: AntiDetectRequests, 
@browser
def scrape_heading_task(driver: AntiDetectDriver, data):
    """Open ``data["link"]``, accept the cookie banner, click through the
    bot-protection check and return the resulting page state.

    Args:
        driver: anti-detect Selenium driver injected by the ``@browser``
            decorator.
        data: dict with a ``"link"`` key holding the URL to visit.

    Returns:
        dict with ``page_source``, ``url`` and ``cookies`` of the final page,
        plus a ``status``/``info`` pair: 200/"success" on the happy path,
        500 with a descriptive message when the bot-protection step fails.
    """
    # Local import: only needed for the narrowed exception handling below.
    from selenium.common.exceptions import TimeoutException

    status = 200
    info = "success"
    driver.get(data["link"])

    # The cookie consent widget lives inside a shadow DOM (#cmpwrapper), so
    # the accept button must be fetched via JS rather than a normal locator.
    # Best-effort: if no banner is served (already accepted, different
    # geo, ...) we continue instead of crashing the whole task.
    try:
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.ID, "cmpwrapper")))
        cookie_button = WebDriverWait(driver, 5).until(EC.visibility_of(
            driver.execute_script(
                "return document.getElementById('cmpwrapper')"
                ".shadowRoot.getElementById('cmpbox')"
                ".querySelector('a.cmpboxbtnyes')")))
        cookie_button.click()
    except TimeoutException:
        pass

    time.sleep(2)  # give the page a moment to settle after consent
    wait = WebDriverWait(driver, 10)
    try:
        # NOTE: wait.until() raises TimeoutException on failure and never
        # returns a falsy value, so an `if button:` guard would be dead code;
        # a missing button is reported via status/info instead of crashing.
        button = wait.until(EC.presence_of_element_located((
            By.XPATH,
            "//button[contains(@class, 'BotProtectionCard-Button') and not(contains(@class, 'hidden'))]",
        )))
        button.click()
        try:
            # Success criterion: the modal gains the "hidden" class.
            wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "div.P24BotProtectionModal.hidden")))
        except TimeoutException:
            # Keep evidence for debugging, then report the failure.
            driver.save_screenshot("screenshot.png")
            status = 500
            info = "Bot protection modal not hidden"
    except TimeoutException:
        driver.save_screenshot("screenshot.png")
        status = 500
        info = "Bot protection button not found"

    return {
        "page_source": driver.page_source,
        "url": driver.current_url,
        "cookies": driver.get_cookies(),
        "status": status,
        "info": info,
    }