Skip to content

Commit

Permalink
add custom
Browse files Browse the repository at this point in the history
  • Loading branch information
Noutbuk committed Apr 3, 2024
1 parent f838417 commit 5d55b7c
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 19 deletions.
6 changes: 5 additions & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,8 @@ build/

/error_logs/

index.http
index.http

test.html
test.json
test.js
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@

test.*

__pycache__/
*.py[cod]
Expand Down Expand Up @@ -27,4 +27,4 @@ build/

/error_logs/

index.http
index.http
2 changes: 1 addition & 1 deletion docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ services:
- ./profiles:/app/profiles
- ./profiles.json:/app/profiles.json
- ./local_storage.json:/app/local_storage.json
- ./db.sqlite3:/app/db.sqlite3
# - ./db.sqlite3:/app/db.sqlite3
ports:
- "3000:3000"
- "8000:8000"
4 changes: 0 additions & 4 deletions main.py

This file was deleted.

47 changes: 36 additions & 11 deletions src/scrape_heading_task.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,41 @@
import time

from botasaurus import *
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


@browser
def scrape_heading_task(driver: AntiDetectDriver, data):
    """Open ``data["link"]``, dismiss the cookie-consent dialog, click the
    bot-protection challenge button, and return the resulting page state.

    Parameters
    ----------
    driver : AntiDetectDriver
        Botasaurus-managed Selenium driver supplied by the ``@browser``
        decorator.
    data : dict
        Task payload; must contain a ``"link"`` key with the target URL.

    Returns
    -------
    dict
        ``page_source``, ``url`` and ``cookies`` captured from the driver,
        plus a ``status``/``info`` pair: 200/"success" normally, or
        500/"Bot protection modal not hidden" when the protection modal
        never disappears (a screenshot is saved in that case).
    """
    status = 200
    info = "success"
    driver.get(data["link"])

    # The consent widget lives inside a shadow root, so Selenium's normal
    # locators cannot reach the accept button; pierce the shadow DOM via JS.
    WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.ID, "cmpwrapper")))
    cookie_button = WebDriverWait(driver, 5).until(EC.visibility_of(driver.execute_script(
        "return document.getElementById('cmpwrapper').shadowRoot.getElementById('cmpbox').querySelector('a.cmpboxbtnyes')")))
    cookie_button.click()

    time.sleep(2)  # give the page a moment to settle after consent dismissal
    wait = WebDriverWait(driver, 10)
    button = wait.until(EC.presence_of_element_located((
        By.XPATH,
        "//button[contains(@class, 'BotProtectionCard-Button') and not(contains(@class, 'hidden'))]",
    )))
    # wait.until() either returns a (truthy) WebElement or raises
    # TimeoutException — a falsy result is impossible, so click directly.
    button.click()
    try:
        # Success criterion: the protection modal gains the "hidden" class.
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "div.P24BotProtectionModal.hidden")))
    except TimeoutException:
        # Narrow catch: only the expected wait failure, not KeyboardInterrupt
        # or programming errors. Keep a screenshot for diagnosis.
        driver.save_screenshot("screenshot.png")
        status = 500
        info = "Bot protection modal not hidden"

    return {
        "page_source": driver.page_source,
        "url": driver.current_url,
        "cookies": driver.get_cookies(),
        "status": status,
        "info": info,
    }

0 comments on commit 5d55b7c

Please sign in to comment.