diff --git a/Dockerfile b/Dockerfile index e61cb27..a0067d1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,13 +26,13 @@ ENV TZ "US/Eastern" RUN echo "US/Eastern" | tee /etc/timezone RUN dpkg-reconfigure --frontend noninteractive tzdata -# Install utilities +## Install utilities RUN apt-get -yqq install ca-certificates curl dnsutils man openssl unzip wget -# Install xvfb and fonts +## Install xvfb and fonts RUN apt-get -yqq install xvfb fonts-ipafont-gothic xfonts-100dpi xfonts-75dpi xfonts-scalable xfonts-cyrillic -# Install Fluxbox (window manager) +## Install Fluxbox (window manager) RUN apt-get -yqq install fluxbox # Install VNC @@ -136,4 +136,5 @@ EXPOSE 4444 5900 ENV DISPLAY=:99 WORKDIR /app -CMD ["/app/make.sh", "entrypoint"] +CMD ["fish"] +#CMD ["/app/make.sh", "entrypoint"] diff --git a/Hub/example-config.toml b/Hub/example-config.toml new file mode 100644 index 0000000..b17326f --- /dev/null +++ b/Hub/example-config.toml @@ -0,0 +1,3 @@ +[router] +username = "admin" +password = "admin" diff --git a/bazos/__init__.py b/bazos/__init__.py index 309d049..7ecbfff 100644 --- a/bazos/__init__.py +++ b/bazos/__init__.py @@ -2,8 +2,9 @@ from pathlib import Path import sys from typing import Dict, Any +from distutils.util import strtobool # noqa -from bazos.main import BazosScrapper, BazosUser +from bazos.scrapper import BazosScrapper, BazosUser, BazosDriver __version__ = "0.1.0" __apiversion__ = "0.1.0" @@ -13,31 +14,34 @@ def parse_cli_argument() -> Dict[str, Any]: parser = argparse.ArgumentParser() + BOOL_AS_STR_ARGUMENTS_for_parser_add_argument = dict( + type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True + ) # true/false parser.add_argument('--login', - action='store_true', + **BOOL_AS_STR_ARGUMENTS_for_parser_add_argument, help='Login to bazos') parser.add_argument('--bazos', - action='store_true', + **BOOL_AS_STR_ARGUMENTS_for_parser_add_argument, help='Use bazos') parser.add_argument('--add-only', - action='store_true', + **BOOL_AS_STR_ARGUMENTS_for_parser_add_argument, help='Add only new products, not remove old ones') parser.add_argument('--print-rubrics', - action='store_true', + **BOOL_AS_STR_ARGUMENTS_for_parser_add_argument, help='Print rubrics') parser.add_argument("--verbose", - action='store_true', - default=True, + **BOOL_AS_STR_ARGUMENTS_for_parser_add_argument, help='Verbose') parser.add_argument("--delete-all", - action='store_true', - default=True, + **BOOL_AS_STR_ARGUMENTS_for_parser_add_argument, help='Verbose') parser.add_argument("--create-all", - action='store_true', - default=True, + **BOOL_AS_STR_ARGUMENTS_for_parser_add_argument, help='Verbose') + parser.add_argument('--remote', + **BOOL_AS_STR_ARGUMENTS_for_parser_add_argument, + help='Use remote') # ? parser.add_argument('--items-path', type=Path, @@ -54,59 +58,58 @@ def parse_cli_argument() -> Dict[str, Any]: nargs="+", help="What bazos country to use", default=['cz', 'sk']) - cli_args = vars(parser.parse_args()) - return cli_args + args = vars(parser.parse_args()) + return args def main(): - cli_args = parse_cli_argument() + args = parse_cli_argument() - if cli_args['verbose']: - # print(f"==> CLI args: {cli_args}") + # Print arguments + if args['verbose']: print(' '.join(sys.argv)) - if cli_args['login']: - bazos_user = BazosUser( - country='cz', items_path=cli_args['items_path'], - credentials_path=cli_args['credentials_path'] - ) + # Driver + bazos_driver = BazosDriver(args=args, country='cz') + + # Login + if args['login']: + bazos_user = BazosUser(country='cz', args=args, driver=bazos_driver.driver) bazos_user.authenticate() bazos_user.save_user_credentials() + else: + bazos_user = BazosUser(country='cz', args=args, driver=bazos_driver.driver) + bazos_user.exists_user_credentials() + + # Rubrics + if args['print_rubrics']: + for country in args['country']: + bazos_user = BazosUser(country=country, args=args, driver=bazos_driver.driver) + bazos_scrapper = BazosScrapper(country=country, args=args, user=bazos_user, driver=bazos_driver.driver) + bazos_scrapper.load_page_with_cookies() + bazos_scrapper.print_all_rubrics_and_categories() - if cli_args['bazos']: - for country in cli_args['country']: - if cli_args['verbose']: + # Bazos + if args['bazos']: + for country in args['country']: + bazos_user = BazosUser(country=country, args=args, driver=bazos_driver.driver) + + if args['verbose']: print(f"==> Processing country: {country}") - # Check if user credentials exists - user = BazosUser( - country='cz', items_path=cli_args['items_path'], - credentials_path=cli_args['credentials_path'] - ) - try: - user.exists_user_credentials(raise_exception=True) - except FileNotFoundError as e: - print(e) - sys.exit(1) - - if cli_args['print_rubrics']: - bazos_scrapper = BazosScrapper(country=country, cli_args=cli_args, user=user) - # bazos_scrapper.check_user_files_available() - bazos_scrapper.load_page_with_cookies() - # bazos_scrapper.check_authentication() - bazos_scrapper.print_all_rubrics_and_categories() - return - - bazos_scrapper = BazosScrapper(country=country, cli_args=cli_args, user=user) + bazos_scrapper = BazosScrapper(country=country, args=args, user=bazos_user, driver=bazos_driver.driver) bazos_scrapper.load_page_with_cookies() # Restore advertisements - if cli_args['delete_all']: + if args['delete_all']: bazos_scrapper.delete_advertisements() - if cli_args['create_all']: + if args['create_all']: bazos_scrapper.create_advertisements() sys.exit() if __name__ == '__main__': + from dotenv import load_dotenv + + load_dotenv(dotenv_path='.env') main() diff --git a/bazos/main.py b/bazos/scrapper.py similarity index 72% rename from bazos/main.py rename to bazos/scrapper.py index 981ef49..7ee269d 100644 --- a/bazos/main.py +++ b/bazos/scrapper.py @@ -1,17 +1,15 @@ -from webdriver_manager.chrome import ChromeDriverManager -from selenium.webdriver.chrome.options import Options -from pathlib import Path import os import pickle # nosec import sys from os import path -from dotenv import load_dotenv from selenium import webdriver from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By from selenium.webdriver.remote.webelement import WebElement from selenium.webdriver.support.ui import Select +from webdriver_manager.chrome import ChromeDriverManager +from selenium.webdriver.chrome.options import Options from bazos.core import settings from bazos.info.product import Product, get_all_products @@ -19,8 +17,6 @@ from bazos.info.rubric_category import get_rubric, get_category from bazos.shared.utils import parse_yaml -load_dotenv() - ################################################################################ # BUG: Some images are rotated, when you upload them to bazos @@ -50,7 +46,8 @@ class XPathsBazos: class BazosDriver: - def __init__(self, country: str): + def __init__(self, country: str, args: dict): + self.args = args self.country = country self.bazos_base_url = f"https://bazos.{country}" self.bazos_moje_inzeraty_url = path.join(self.bazos_base_url, 'moje-inzeraty.php') @@ -61,12 +58,12 @@ def set_chrome_options(self) -> Options: Chrome options for headless browser is enabled. """ chrome_options = Options() - chrome_options.add_argument("--headless") - chrome_options.add_argument("--no-sandbox") - chrome_options.add_argument("--disable-dev-shm-usage") - chrome_prefs = {} - chrome_options.experimental_options["prefs"] = chrome_prefs - chrome_prefs["profile.default_content_settings"] = {"images": 2} + # chrome_options.add_argument("--headless") + # chrome_options.add_argument("--no-sandbox") + # chrome_options.add_argument("--disable-dev-shm-usage") + # chrome_prefs = {} + # chrome_options.experimental_options["prefs"] = chrome_prefs + # chrome_prefs["profile.default_content_settings"] = {"images": 2} # options.binary_location = '/usr/bin/google-chrome' # options.add_argument('--headless') # options.add_argument('--no-sandbox') @@ -79,44 +76,53 @@ def set_chrome_options(self) -> Options: # options.add_argument('--user-agent={}'.format(random.choice(list(self.user_agents)))) return chrome_options - def get_driver(self): + def get_driver(self) -> webdriver.Chrome | webdriver.Remote: options = self.set_chrome_options() - webdriver_manager = ChromeDriverManager().install() - service = Service(executable_path=webdriver_manager) - driver = webdriver.Chrome(service=service, options=options) - return driver + if self.args['remote']: + options = webdriver.ChromeOptions() + options.add_experimental_option("detach", True) + server = "http://localhost:4444" # TODO: Add to args or .env + driver = webdriver.Remote(command_executor=server, options=options) + return driver + else: + webdriver_manager = ChromeDriverManager().install() + service = Service(executable_path=webdriver_manager) + driver = webdriver.Chrome(service=service, options=options) + return driver def __del__(self): - self.driver.close() + self.driver.quit() -class BazosUser(BazosDriver): - def __init__(self, country: str, items_path: Path, credentials_path: Path): - super().__init__(country) - self.items_path = items_path - self.credentaials_path = credentials_path +class BazosUser: + def __init__(self, country: str, args: dict, driver: webdriver.Remote | webdriver.Chrome): + # super().__init__(country, args) + self.driver = driver + self.args = args + self.country = country self.required_keys = ['name', 'phone_number', 'email', 'password', 'psc'] try: - for k, v in parse_yaml(filename=path.join(items_path, f"user_{country}.yml")).items(): + yaml_parsed = parse_yaml(filename=path.join(self.args['items_path'], f"user_{country}.yml")) + for k, v in yaml_parsed.items(): setattr(self, k, v) # Check that all keys are present if not all([hasattr(self, k) for k in self.required_keys]): raise KeyError( - 'Some of keys are missing, check `user_{country}.yml` file, ' + 'Some keys are missing, check `user_{country}.yml` file, ' 'if all keys are present (name, phone_number, email, password, psc)' ) except KeyError as e: print(f"KeyError: {e}") - print(f"Please provide correct `user_{country}.yml` file in `{items_path}`") + print(f"Please provide correct `user_{country}.yml` file in `{self.args['items_path']}`") sys.exit(1) except FileNotFoundError: - print(f"FileNotFoundError: Please provide `user_{country}.yml` file in `{items_path}`") + print(f"FileNotFoundError: Please provide `user_{country}.yml` file in `{self.args['items_path']}`") sys.exit(1) def exists_user_credentials(self, raise_exception: bool = False) -> None: - cookies_file = self.credentaials_path / f"{settings.COOKIES_FILE}_{self.country}.pkl" - local_storage_file = self.credentaials_path / f"{settings.LOCAL_STORAGE_FILE}_{self.country}.pkl" + cookies_file = self.args["credentials_path"] / f"{settings.COOKIES_FILE}_{self.country}.pkl" + local_storage_file = self.args["credentials_path"] / f"{settings.LOCAL_STORAGE_FILE}_{self.country}.pkl" if not os.path.isfile(cookies_file or not os.path.isfile(local_storage_file)): # nosec if raise_exception: raise FileNotFoundError("User files not found, please login - login flag: --login") @@ -143,29 +149,24 @@ def authenticate(self) -> None: self.driver.find_element(By.XPATH, XPathsBazos.auth_code_submit).click() # Submit def save_user_credentials(self) -> None: - cookies_file = self.credentaials_path / f"{settings.COOKIES_FILE}_{self.country}.pkl" - local_storage_file = self.credentaials_path / f"{settings.LOCAL_STORAGE_FILE}_{self.country}.pkl" + cookies_file = self.args["credentials_path"] / f"{settings.COOKIES_FILE}_{self.country}.pkl" + local_storage_file = self.args["credentials_path"] / f"{settings.LOCAL_STORAGE_FILE}_{self.country}.pkl" cookies = self.driver.get_cookies() local_storage = self.driver.execute_script("return window.localStorage;") pickle.dump(cookies, file=open(cookies_file.__str__(), "wb")) # nosec pickle.dump(local_storage, file=open(local_storage_file, "wb")) # nosec -class BazosScrapper(BazosDriver): - def __init__(self, country: str, cli_args: dict, user: BazosUser): - super().__init__(country) - self.cli_args = cli_args - options = self.set_chrome_options() - webdriver_manager = ChromeDriverManager().install() - service = Service(executable_path=webdriver_manager) - self.driver = webdriver.Chrome(service=service, options=options) - - # URLs - self.user = user +class BazosScrapper: + def __init__(self, country: str, args: dict, user: BazosUser, driver: webdriver.Remote | webdriver.Chrome): + # super().__init__(country, args) + self.driver = driver + self.args = args self.country = country + self.bazos_base_url = f"https://bazos.{country}" + self.bazos_moje_inzeraty_url = path.join(self.bazos_base_url, 'moje-inzeraty.php') + self.user = user self.advertisements: int - self.url_bazos = f"https://bazos.{country}" - self.url_moje_inzeraty = path.join(self.url_bazos, 'moje-inzeraty.php') def print_all_rubrics_and_categories(self): self.driver.find_element( @@ -195,17 +196,16 @@ def print_all_rubrics_and_categories(self): print(_dict) def load_page_with_cookies(self) -> None: - self.driver.get(self.url_moje_inzeraty) - cookies_file = self.user.credentaials_path / f"{settings.COOKIES_FILE}_{self.country}.pkl" + self.driver.get(self.bazos_moje_inzeraty_url) + cookies_file = self.args["credentials_path"] / f"{settings.COOKIES_FILE}_{self.country}.pkl" for cookie_dict in pickle.load(open(cookies_file, 'rb')): # nosec self.driver.add_cookie(cookie_dict) - self.driver.get(self.url_moje_inzeraty) + self.driver.get(self.bazos_moje_inzeraty_url) def remove_advertisment(self): - self.driver.find_element(By.CLASS_NAME, 'inzeratydetdel').find_element( - By.TAG_NAME, 'a').click() - pwd_input = self.driver.find_element( - By.XPATH, XPathsBazos.delete_pwd_input) + del_btn = self.driver.find_element(By.CLASS_NAME, 'inzeratydetdel').find_element(By.TAG_NAME, 'a') + del_btn.click() + pwd_input = self.driver.find_element(By.XPATH, XPathsBazos.delete_pwd_input) pwd_input.clear() pwd_input.send_keys(getattr(self.user, 'password')) self.driver.find_element(By.XPATH, XPathsBazos.delete_submit).click() # Submit-Delete @@ -213,11 +213,11 @@ def remove_advertisment(self): def delete_advertisements(self): self.advertisements = len(self.driver.find_elements(By.CLASS_NAME, 'nadpis')) - if self.cli_args['verbose']: + if self.args['verbose']: print("==> Removing old advertisements") for i in range(self.advertisements): element = self.driver.find_element(By.CLASS_NAME, 'nadpis') - if self.cli_args['verbose']: + if self.args['verbose']: print(f"Removing[{i}/{self.advertisements}]: {element.text}") # @@ -229,21 +229,16 @@ def delete_advertisements(self): def add_advertisement(self, product: Product): # Rubrik - select_rubrik = Select(self.driver.find_element( - By.XPATH, XPathsBazos.product_rubric)) - select_rubrik.select_by_visible_text( - get_rubric(self.country, product.rubric)) + select_rubrik = Select(self.driver.find_element(By.XPATH, XPathsBazos.product_rubric)) + select_rubrik.select_by_visible_text(get_rubric(self.country, product.rubric)) # Product - select_category = Select(self.driver.find_element( - By.XPATH, XPathsBazos.product_category)) - select_category.select_by_visible_text(get_category( - self.country, product.rubric, product.category)) + select_category = Select(self.driver.find_element(By.XPATH, XPathsBazos.product_category)) + select_category.select_by_visible_text(get_category(self.country, product.rubric, product.category)) wait_random_time() self.driver.find_element(By.ID, 'nadpis').send_keys(product.title) self.driver.find_element(By.ID, 'popis').send_keys(product.description) - self.driver.find_element(By.ID, 'cena').send_keys( - product.get_location_price(self.country)) + self.driver.find_element(By.ID, 'cena').send_keys(product.get_location_price(self.country)) wait_random_time() self.driver.find_element(By.ID, 'lokalita').clear() @@ -251,8 +246,7 @@ def add_advertisement(self, product: Product): self.driver.find_element(By.ID, 'jmeno').clear() self.driver.find_element(By.ID, 'jmeno').send_keys(getattr(self.user, 'name')) self.driver.find_element(By.ID, 'telefoni').clear() - self.driver.find_element( - By.ID, 'telefoni').send_keys(getattr(self.user, 'phone_number')) + self.driver.find_element(By.ID, 'telefoni').send_keys(getattr(self.user, 'phone_number')) self.driver.find_element(By.ID, 'maili').clear() self.driver.find_element(By.ID, 'maili').send_keys(getattr(self.user, 'email')) self.driver.find_element(By.ID, 'heslobazar').clear() @@ -266,19 +260,19 @@ def add_advertisement(self, product: Product): self.driver.find_element(By.XPATH, XPathsBazos.product_submit).click() def create_advertisements(self) -> None: - products = get_all_products(products_path=getattr(self, "items_path"), country=self.country) + products = get_all_products(products_path=self.args["items_path"], country=self.country) self.advertisements = len(products) - if self.cli_args['verbose']: + if self.args['verbose']: print("==> Adding advertisements") for idx, product in enumerate(products): if self.product_already_advertised(product): - if self.cli_args['verbose']: + if self.args['verbose']: print(f"Skipping[{idx}/{self.advertisements}]: {product.product_path}") continue - if self.cli_args['verbose']: + if self.args['verbose']: print(f"Adding[{idx}/{self.advertisements}]: {product.product_path}") # product not advertised ADD them diff --git a/docker-compose.yml b/docker-compose.yml index d92fd21..d1fa12e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,8 +3,8 @@ version: '3.6' name: bazos-api services: - bazos-api: - container_name: ${COMPOSE_PROJECT_NAME} + bazos: + container_name: ${COMPOSE_PROJECT_NAME}-bazos # Here must be defined both image and build, because we use different .env files with which are the images build image: zdeneklapes/${COMPOSE_PROJECT_NAME}:latest build: @@ -15,19 +15,36 @@ services: - ./tmp/fish/:/root/.local/share/fish/ # Mount the fish shell history, to remain files when docker container is rebuild: This will create ./tmp/fish/ folder in the project directory if it doesn't exist already - ./tmp/fish/:/home/user1/.local/share/fish/ # Mount the fish shell history, to remain files when docker container is rebuild: This will create ./tmp/fish/ folder in the project directory if it doesn't exist already - $HOME/Documents/photos-archive/bazos:/tmp/images/ - ports: - - 4444:4444 - - 8090:8090 - - 9050:9050 stdin_open: true tty: true depends_on: - - selenium-chrome - selenium-chrome: + - selenium +# env_file: +# - .env + environment: + - SELENIUM_URL='http://selenium:4444' + selenium: + container_name: ${COMPOSE_PROJECT_NAME}-selenium image: selenium/standalone-chrome:latest ports: - "4444:4444" - "7900:7900" shm_size: "2g" - container_name: selenium-chrome-container - restart: unless-stopped + chrome: + image: selenium/node-chrome:4.16.1-20231219 + shm_size: 2gb + depends_on: + - selenium-hub + environment: + - SE_EVENT_BUS_HOST=selenium-hub + - SE_EVENT_BUS_PUBLISH_PORT=4442 + - SE_EVENT_BUS_SUBSCRIBE_PORT=4443 + selenium-hub: + image: selenium/hub:4.16.1-20231219 + container_name: selenium-hub + ports: + - "4442:4442" + - "4443:4443" + - "4444:4444" +# volumes: +# - ./Hub/example-config.toml:/opt/selenium/config.toml diff --git a/requirements.txt b/requirements.txt index c168f2c..8ee0f74 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,32 +1,13 @@ -attrs==23.1.0 -certifi==2023.11.17 -chardet==3.0.4 -charset-normalizer==3.3.2 -CurrencyConverter==0.17.14 -dbus-python==1.2.16 -distro-info==0.23+ubuntu1.1 -exceptiongroup==1.2.0 -h11==0.14.0 -idna==2.8 -outcome==1.3.0.post0 -packaging==23.2 +bandit==1.7.5 +bazos==0.1.0 +colorama==0.4.6 +currencyconverter==0.17.9 +forex-python==1.8 +pip-autoremove==0.10.0 pip-chill==1.0.3 -pycairo==1.25.1 -PyGObject==3.36.0 -PySocks==1.7.1 -python-apt==2.0.1+ubuntu0.20.4.1 -python-dotenv==1.0.0 -PyYAML==6.0.1 -requests==2.31.0 -requests-unixsocket==0.2.0 -selenium==4.16.0 -six==1.14.0 -sniffio==1.3.0 -sortedcontainers==2.4.0 -supervisor==4.1.0 -trio==0.23.2 -trio-websocket==0.11.1 -unattended-upgrades==0.1 -urllib3==2.1.0 +pysocks==1.7.1 +selenium==4.10.0 +tomli==2.0.1 +tqdm==4.66.1 +versioneer==0.29 webdriver-manager==4.0.1 -wsproto==1.2.0