From 5d55b7c16c1575bd8612efa69d6a276044311b7a Mon Sep 17 00:00:00 2001 From: Noutbuk Date: Wed, 3 Apr 2024 22:10:54 +0200 Subject: [PATCH] add custom --- .dockerignore | 6 ++++- .gitignore | 4 ++-- docker-compose.yaml | 2 +- main.py | 4 ---- src/scrape_heading_task.py | 47 +++++++++++++++++++++++++++++--------- 5 files changed, 44 insertions(+), 19 deletions(-) delete mode 100644 main.py diff --git a/.dockerignore b/.dockerignore index 16d1f04..3db445f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -27,4 +27,8 @@ build/ /error_logs/ -index.http \ No newline at end of file +index.http + +test.html +test.json +test.js diff --git a/.gitignore b/.gitignore index 16d1f04..08a17c9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ - +test.* __pycache__/ *.py[cod] @@ -27,4 +27,4 @@ build/ /error_logs/ -index.http \ No newline at end of file +index.http diff --git a/docker-compose.yaml b/docker-compose.yaml index 9e71ddb..f2ae39c 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -12,7 +12,7 @@ services: - ./profiles:/app/profiles - ./profiles.json:/app/profiles.json - ./local_storage.json:/app/local_storage.json - - ./db.sqlite3:/app/db.sqlite3 + # - ./db.sqlite3:/app/db.sqlite3 ports: - "3000:3000" - "8000:8000" diff --git a/main.py b/main.py deleted file mode 100644 index dbe49a3..0000000 --- a/main.py +++ /dev/null @@ -1,4 +0,0 @@ -from src.scrape_heading_task import scrape_heading_task - -if __name__ == "__main__": - scrape_heading_task({"link":"https://www.omkar.cloud/"}) \ No newline at end of file diff --git a/src/scrape_heading_task.py b/src/scrape_heading_task.py index f9f3c99..6f9a738 100644 --- a/src/scrape_heading_task.py +++ b/src/scrape_heading_task.py @@ -1,16 +1,41 @@ +import time from botasaurus import * +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC -@request -def scrape_heading_task(request: AntiDetectRequests, 
@browser
def scrape_heading_task(driver: AntiDetectDriver, data):
    """Open ``data["link"]``, accept the cookie banner, click through the
    bot-protection check and return the resulting page state.

    Args:
        driver: anti-detect Selenium driver injected by the ``@browser``
            decorator.
        data: dict with a ``"link"`` key holding the URL to visit.

    Returns:
        dict with ``page_source``, ``url`` and ``cookies`` of the final page,
        plus a ``status``/``info`` pair: 200/"success" on the happy path,
        500 with a descriptive message when the bot-protection step fails.
    """
    # Local import: only needed for the narrowed exception handling below.
    from selenium.common.exceptions import TimeoutException

    status = 200
    info = "success"
    driver.get(data["link"])

    # The cookie consent widget lives inside a shadow DOM (#cmpwrapper), so
    # the accept button must be fetched via JS rather than a normal locator.
    # Best-effort: if no banner is served (already accepted, different
    # geo, ...) we continue instead of crashing the whole task.
    try:
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.ID, "cmpwrapper")))
        cookie_button = WebDriverWait(driver, 5).until(EC.visibility_of(
            driver.execute_script(
                "return document.getElementById('cmpwrapper')"
                ".shadowRoot.getElementById('cmpbox')"
                ".querySelector('a.cmpboxbtnyes')")))
        cookie_button.click()
    except TimeoutException:
        pass

    time.sleep(2)  # give the page a moment to settle after consent
    wait = WebDriverWait(driver, 10)
    try:
        # NOTE: wait.until() raises TimeoutException on failure and never
        # returns a falsy value, so an `if button:` guard would be dead code;
        # a missing button is reported via status/info instead of crashing.
        button = wait.until(EC.presence_of_element_located((
            By.XPATH,
            "//button[contains(@class, 'BotProtectionCard-Button') and not(contains(@class, 'hidden'))]",
        )))
        button.click()
        try:
            # Success criterion: the modal gains the "hidden" class.
            wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "div.P24BotProtectionModal.hidden")))
        except TimeoutException:
            # Keep evidence for debugging, then report the failure.
            driver.save_screenshot("screenshot.png")
            status = 500
            info = "Bot protection modal not hidden"
    except TimeoutException:
        driver.save_screenshot("screenshot.png")
        status = 500
        info = "Bot protection button not found"

    return {
        "page_source": driver.page_source,
        "url": driver.current_url,
        "cookies": driver.get_cookies(),
        "status": status,
        "info": info,
    }