From 7877bcd403bedb30aab9dc61b64a8f3445321697 Mon Sep 17 00:00:00 2001 From: martin-martin Date: Wed, 24 Jul 2024 11:51:20 +0200 Subject: [PATCH 01/12] Add project code --- .../README.md | 41 +++++ .../books/books/__init__.py | 0 .../books/books/items.py | 8 + .../books/books/middlewares.py | 105 ++++++++++++ .../books/books/pipelines.py | 42 +++++ .../books/books/settings.py | 33 ++++ .../books/books/spiders/__init__.py | 4 + .../books/books/spiders/book.py | 44 +++++ .../books/scrapy.cfg | 11 ++ .../books/tests/__init__.py | 0 .../books/tests/sample.html | 155 ++++++++++++++++++ .../books/tests/test_book.py | 75 +++++++++ .../requirements.txt | 40 +++++ 13 files changed, 558 insertions(+) create mode 100644 web-scraping-with-scrapy-and-mongodb/README.md create mode 100644 web-scraping-with-scrapy-and-mongodb/books/books/__init__.py create mode 100644 web-scraping-with-scrapy-and-mongodb/books/books/items.py create mode 100644 web-scraping-with-scrapy-and-mongodb/books/books/middlewares.py create mode 100644 web-scraping-with-scrapy-and-mongodb/books/books/pipelines.py create mode 100644 web-scraping-with-scrapy-and-mongodb/books/books/settings.py create mode 100644 web-scraping-with-scrapy-and-mongodb/books/books/spiders/__init__.py create mode 100644 web-scraping-with-scrapy-and-mongodb/books/books/spiders/book.py create mode 100644 web-scraping-with-scrapy-and-mongodb/books/scrapy.cfg create mode 100644 web-scraping-with-scrapy-and-mongodb/books/tests/__init__.py create mode 100644 web-scraping-with-scrapy-and-mongodb/books/tests/sample.html create mode 100644 web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py create mode 100644 web-scraping-with-scrapy-and-mongodb/requirements.txt diff --git a/web-scraping-with-scrapy-and-mongodb/README.md b/web-scraping-with-scrapy-and-mongodb/README.md new file mode 100644 index 0000000000..c0dfe9ec75 --- /dev/null +++ b/web-scraping-with-scrapy-and-mongodb/README.md @@ -0,0 +1,41 @@ +# Web Scraping With Scrapy and MongoDB + +[Web Scraping With Scrapy and MongoDB](https://realpython.com/web-scraping-with-scrapy-and-mongodb/) is an example project for building a robust web scraper for static sites leveraging Scrapy and MongoDB. + +## Installation and Setup + +1. Create a Python virtual environment + +```sh +$ python -m venv ./venv +$ source venv/bin/activate +(venv) $ +``` + +2. Install the requirements + +```sh +(venv) $ pip install -r requirements.txt +``` + +You'll also need to [set up a MongoDB collection](https://realpython.com/web-scraping-with-scrapy-and-mongodb/#set-up-a-mongodb-collection-on-your-computer) like described in the tutorial. + +## Run the Scraper + +Navigate into the `books/` project directory. + +Then you can start crawling the site: + +```sh +(venv) $ scrapy crawl book +``` + +If set up correctly, this will populate your MongoDB collection with the book information scraped from the example site. + +## About the Author + +Martin Breuss - Email: martin@realpython.com + +## License + +Distributed under the MIT license. See ``LICENSE`` for more information. 
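As a quick sanity check after the crawl finishes, you can query the scraped documents directly with `pymongo` (already pinned in `requirements.txt`). This is a minimal sketch, assuming the default `MONGO_URI` and `MONGO_DATABASE` values from `settings.py` and the `books` collection name used by the pipeline:

```python
import pymongo

# Connection details mirror MONGO_URI and MONGO_DATABASE in settings.py.
client = pymongo.MongoClient("mongodb://localhost:27017")
collection = client["books_db"]["books"]

# The pipeline stores the SHA-256 hash of each book's URL as _id, so the
# document count should match the number of unique books scraped.
print(collection.count_documents({}))
print(collection.find_one())
```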
diff --git a/web-scraping-with-scrapy-and-mongodb/books/books/__init__.py b/web-scraping-with-scrapy-and-mongodb/books/books/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/web-scraping-with-scrapy-and-mongodb/books/books/items.py b/web-scraping-with-scrapy-and-mongodb/books/books/items.py new file mode 100644 index 0000000000..b7013a9c25 --- /dev/null +++ b/web-scraping-with-scrapy-and-mongodb/books/books/items.py @@ -0,0 +1,8 @@ +import scrapy + + +class BookItem(scrapy.Item): + _id = scrapy.Field() + url = scrapy.Field() + title = scrapy.Field() + price = scrapy.Field() diff --git a/web-scraping-with-scrapy-and-mongodb/books/books/middlewares.py b/web-scraping-with-scrapy-and-mongodb/books/books/middlewares.py new file mode 100644 index 0000000000..51272c689c --- /dev/null +++ b/web-scraping-with-scrapy-and-mongodb/books/books/middlewares.py @@ -0,0 +1,105 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +# useful for handling different item types with a single interface +from scrapy import signals + + +class BooksSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect( + s.spider_opened, signal=signals.spider_opened + ) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) + + +class BooksDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect( + s.spider_opened, signal=signals.spider_opened + ) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. 
+ + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) diff --git a/web-scraping-with-scrapy-and-mongodb/books/books/pipelines.py b/web-scraping-with-scrapy-and-mongodb/books/books/pipelines.py new file mode 100644 index 0000000000..590aa8d1a3 --- /dev/null +++ b/web-scraping-with-scrapy-and-mongodb/books/books/pipelines.py @@ -0,0 +1,42 @@ +import hashlib + +import pymongo +from itemadapter import ItemAdapter +from scrapy.exceptions import DropItem + + +class MongoPipeline: + collection_name = "books" + + def __init__(self, mongo_uri, mongo_db): + self.mongo_uri = mongo_uri + self.mongo_db = mongo_db + + @classmethod + def from_crawler(cls, crawler): + return cls( + mongo_uri=crawler.settings.get("MONGO_URI"), + mongo_db=crawler.settings.get("MONGO_DATABASE"), + ) + + def open_spider(self, spider): + self.client = pymongo.MongoClient(self.mongo_uri) + self.db = self.client[self.mongo_db] + + def close_spider(self, spider): + self.client.close() + + def process_item(self, item, spider): + item_id = self.compute_item_id(item) + if self.db[self.collection_name].find_one({"_id": item_id}): + raise DropItem(f"Duplicate item found: {item}") + else: + item["_id"] = item_id + self.db[self.collection_name].insert_one( + ItemAdapter(item).asdict() + ) + return item + + def compute_item_id(self, item): + url = item["url"] + return hashlib.sha256(url.encode("utf-8")).hexdigest() diff --git a/web-scraping-with-scrapy-and-mongodb/books/books/settings.py b/web-scraping-with-scrapy-and-mongodb/books/books/settings.py new file mode 100644 index 0000000000..3825169786 --- /dev/null +++ b/web-scraping-with-scrapy-and-mongodb/books/books/settings.py @@ -0,0 +1,33 @@ +# Scrapy settings for books project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "books" + +SPIDER_MODULES = ["books.spiders"] +NEWSPIDER_MODULE = "books.spiders" + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + "books.pipelines.MongoPipeline": 300, +} + +# Set settings whose default value is deprecated to a future-proof value +REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" +TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +FEED_EXPORT_ENCODING = "utf-8" + +MONGO_URI = "mongodb://localhost:27017" +MONGO_DATABASE = "books_db" + +LOG_LEVEL = "WARNING" +LOG_FILE = "book_scraper.log" diff --git a/web-scraping-with-scrapy-and-mongodb/books/books/spiders/__init__.py b/web-scraping-with-scrapy-and-mongodb/books/books/spiders/__init__.py new file mode 100644 index 0000000000..ebd689ac51 --- /dev/null +++ b/web-scraping-with-scrapy-and-mongodb/books/books/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/web-scraping-with-scrapy-and-mongodb/books/books/spiders/book.py b/web-scraping-with-scrapy-and-mongodb/books/books/spiders/book.py new file mode 100644 index 0000000000..2c0054dc6d --- /dev/null +++ b/web-scraping-with-scrapy-and-mongodb/books/books/spiders/book.py @@ -0,0 +1,44 @@ +import scrapy + +from books.items import BookItem + + +class BookSpider(scrapy.Spider): + name = "book" + allowed_domains = ["books.toscrape.com"] + start_urls = ["https://books.toscrape.com/"] + + def start_requests(self): + for url in self.start_urls: + yield scrapy.Request( + url, callback=self.parse, errback=self.log_error + ) + + def parse(self, response): + """ + @url https://books.toscrape.com + @returns items 20 20 + @returns request 1 50 + @scrapes url title price + """ + for book in response.css("article.product_pod"): + item = BookItem() + item["url"] = book.css("h3 > a::attr(href)").get() + item["title"] = book.css("h3 > a::attr(title)").get() + item["price"] = book.css(".price_color::text").get() + yield item + + next_page = response.css("li.next > a::attr(href)").get() + if next_page: + next_page_url = response.urljoin(next_page) + self.logger.info( + f"Navigating to next page with URL {next_page_url}." 
+ ) + yield scrapy.Request( + url=next_page_url, + callback=self.parse, + errback=self.log_error, + ) + + def log_error(self, failure): + self.logger.error(repr(failure)) diff --git a/web-scraping-with-scrapy-and-mongodb/books/scrapy.cfg b/web-scraping-with-scrapy-and-mongodb/books/scrapy.cfg new file mode 100644 index 0000000000..081989204f --- /dev/null +++ b/web-scraping-with-scrapy-and-mongodb/books/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = books.settings + +[deploy] +#url = http://localhost:6800/ +project = books diff --git a/web-scraping-with-scrapy-and-mongodb/books/tests/__init__.py b/web-scraping-with-scrapy-and-mongodb/books/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/web-scraping-with-scrapy-and-mongodb/books/tests/sample.html b/web-scraping-with-scrapy-and-mongodb/books/tests/sample.html new file mode 100644 index 0000000000..c4476d1ce9 --- /dev/null +++ b/web-scraping-with-scrapy-and-mongodb/books/tests/sample.html @@ -0,0 +1,155 @@ + + + +
+<!-- Sample books.toscrape.com listing page used as a test fixture. It contains
+     two article.product_pod entries, each with an h3 > a link (href and title
+     attributes) and a p.price_color element: "A Light in the Attic"
+     (catalogue/a-light-in-the-attic_1000/index.html, £51.77, In stock) and
+     "Tipping the Velvet" (catalogue/tipping-the-velvet_999/index.html, £53.74,
+     In stock). The pager reads "Page 1 of 50" and has a li.next > a link to
+     catalogue/page-2.html. -->
+ + + \ No newline at end of file diff --git a/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py b/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py new file mode 100644 index 0000000000..609f0490ed --- /dev/null +++ b/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py @@ -0,0 +1,75 @@ +import unittest +from pathlib import Path + +from scrapy.http import HtmlResponse, Request + +from books.items import BookItem +from books.spiders.book import BookSpider + + +def _get_sample_html_content(): + html_file_path = Path(__file__).parent / "sample.html" + with html_file_path.open() as html_file: + return html_file.read() + + +class BookSpiderTest(unittest.TestCase): + def setUp(self): + self.spider = BookSpider() + self.example_html = _get_sample_html_content() + self.response = HtmlResponse( + url="https://books.toscrape.com", + body=self.example_html, + encoding="utf-8", + ) + + def test_parse_scrapes_all_items(self): + """Test if the spider scrapes all books and pagination links.""" + # Collect the items produced by the generator in a list + # so that it's possible to iterate over it more than once. + results = list(self.spider.parse(self.response)) + + # There should be two book items and one pagination request + book_items = [ + item for item in results if isinstance(item, BookItem) + ] + pagination_requests = [ + item for item in results if isinstance(item, Request) + ] + + self.assertEqual(len(book_items), 2) + self.assertEqual(len(pagination_requests), 1) + + def test_parse_scrapes_correct_book_information(self): + """Test if the spider scrapes the correct information for each book.""" + results_generator = self.spider.parse(self.response) + + # Book 1 + book_1 = next(results_generator) + self.assertEqual( + book_1["url"], "catalogue/a-light-in-the-attic_1000/index.html" + ) + self.assertEqual(book_1["title"], "A Light in the Attic") + self.assertEqual(book_1["price"], "£51.77") + + # Book 2 + book_2 = next(results_generator) + self.assertEqual( + book_2["url"], "catalogue/tipping-the-velvet_999/index.html" + ) + self.assertEqual(book_2["title"], "Tipping the Velvet") + self.assertEqual(book_2["price"], "£53.74") + + def test_parse_creates_pagination_request(self): + """Test if the spider creates a pagination request correctly.""" + results = list(self.spider.parse(self.response)) + next_page_request = results[-1] + self.assertIsInstance(next_page_request, Request) + self.assertEqual( + next_page_request.url, + "https://books.toscrape.com/catalogue/page-2.html", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/web-scraping-with-scrapy-and-mongodb/requirements.txt b/web-scraping-with-scrapy-and-mongodb/requirements.txt new file mode 100644 index 0000000000..bf2b96c05e --- /dev/null +++ b/web-scraping-with-scrapy-and-mongodb/requirements.txt @@ -0,0 +1,40 @@ +attrs==23.2.0 +Automat==22.10.0 +certifi==2024.7.4 +cffi==1.16.0 +charset-normalizer==3.3.2 +constantly==23.10.4 +cryptography==43.0.0 +cssselect==1.2.0 +defusedxml==0.7.1 +dnspython==2.6.1 +filelock==3.15.4 +hyperlink==21.0.0 +idna==3.7 +incremental==22.10.0 +itemadapter==0.9.0 +itemloaders==1.3.1 +jmespath==1.0.1 +lxml==5.2.2 +packaging==24.1 +parsel==1.9.1 +Protego==0.3.1 +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pycparser==2.22 +PyDispatcher==2.0.7 +pymongo==4.8.0 +pyOpenSSL==24.2.1 +queuelib==1.7.0 +requests==2.32.3 +requests-file==2.1.0 +Scrapy==2.11.2 +service-identity==24.1.0 +setuptools==71.1.0 +six==1.16.0 +tldextract==5.1.2 +Twisted==24.3.0 +typing_extensions==4.12.2 
+urllib3==2.2.2 +w3lib==2.2.1 +zope.interface==6.4.post2 From a1b0efcb846f8741fdf21a1204a8793170a8e112 Mon Sep 17 00:00:00 2001 From: martin-martin Date: Wed, 24 Jul 2024 12:01:08 +0200 Subject: [PATCH 02/12] Fix linter error --- web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py b/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py index 609f0490ed..301a400a51 100644 --- a/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py +++ b/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py @@ -1,10 +1,9 @@ import unittest from pathlib import Path -from scrapy.http import HtmlResponse, Request - from books.items import BookItem from books.spiders.book import BookSpider +from scrapy.http import HtmlResponse, Request def _get_sample_html_content(): From 7f2446647c6bb445beb173ae5ec937a729f7ed80 Mon Sep 17 00:00:00 2001 From: martin-martin Date: Wed, 24 Jul 2024 15:50:19 +0200 Subject: [PATCH 03/12] =?UTF-8?q?line=20length=2079,=20not=2076=20?= =?UTF-8?q?=F0=9F=98=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../books/books/middlewares.py | 8 ++------ .../books/tests/test_book.py | 4 +--- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/web-scraping-with-scrapy-and-mongodb/books/books/middlewares.py b/web-scraping-with-scrapy-and-mongodb/books/books/middlewares.py index 51272c689c..428490276a 100644 --- a/web-scraping-with-scrapy-and-mongodb/books/books/middlewares.py +++ b/web-scraping-with-scrapy-and-mongodb/books/books/middlewares.py @@ -16,9 +16,7 @@ class BooksSpiderMiddleware: def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() - crawler.signals.connect( - s.spider_opened, signal=signals.spider_opened - ) + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(self, response, spider): @@ -65,9 +63,7 @@ class BooksDownloaderMiddleware: def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. 
s = cls() - crawler.signals.connect( - s.spider_opened, signal=signals.spider_opened - ) + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): diff --git a/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py b/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py index 301a400a51..30f3edc4f8 100644 --- a/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py +++ b/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py @@ -29,9 +29,7 @@ def test_parse_scrapes_all_items(self): results = list(self.spider.parse(self.response)) # There should be two book items and one pagination request - book_items = [ - item for item in results if isinstance(item, BookItem) - ] + book_items = [item for item in results if isinstance(item, BookItem)] pagination_requests = [ item for item in results if isinstance(item, Request) ] From 4f2a265e0b2ec560f83c9b364b5d3027aa0d677b Mon Sep 17 00:00:00 2001 From: martin-martin Date: Fri, 26 Jul 2024 13:20:02 +0200 Subject: [PATCH 04/12] Uppercase constant name --- .../books/books/pipelines.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/web-scraping-with-scrapy-and-mongodb/books/books/pipelines.py b/web-scraping-with-scrapy-and-mongodb/books/books/pipelines.py index 590aa8d1a3..41e2e6e3aa 100644 --- a/web-scraping-with-scrapy-and-mongodb/books/books/pipelines.py +++ b/web-scraping-with-scrapy-and-mongodb/books/books/pipelines.py @@ -6,7 +6,7 @@ class MongoPipeline: - collection_name = "books" + COLLECTION_NAME = "books" def __init__(self, mongo_uri, mongo_db): self.mongo_uri = mongo_uri @@ -28,11 +28,11 @@ def close_spider(self, spider): def process_item(self, item, spider): item_id = self.compute_item_id(item) - if self.db[self.collection_name].find_one({"_id": item_id}): + if self.db[self.COLLECTION_NAME].find_one({"_id": item_id}): raise DropItem(f"Duplicate item found: {item}") else: item["_id"] = item_id - self.db[self.collection_name].insert_one( + self.db[self.COLLECTION_NAME].insert_one( ItemAdapter(item).asdict() ) return item From 49ed83831857ea561e51e7aa9d6002376954cea0 Mon Sep 17 00:00:00 2001 From: martin-martin Date: Fri, 26 Jul 2024 13:28:19 +0200 Subject: [PATCH 05/12] Update class name Co-authored-by: Bartosz --- web-scraping-with-scrapy-and-mongodb/books/books/items.py | 2 +- .../books/books/spiders/book.py | 5 ++--- .../books/tests/test_book.py | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/web-scraping-with-scrapy-and-mongodb/books/books/items.py b/web-scraping-with-scrapy-and-mongodb/books/books/items.py index b7013a9c25..9205771c7b 100644 --- a/web-scraping-with-scrapy-and-mongodb/books/books/items.py +++ b/web-scraping-with-scrapy-and-mongodb/books/books/items.py @@ -1,7 +1,7 @@ import scrapy -class BookItem(scrapy.Item): +class BooksItem(scrapy.Item): _id = scrapy.Field() url = scrapy.Field() title = scrapy.Field() diff --git a/web-scraping-with-scrapy-and-mongodb/books/books/spiders/book.py b/web-scraping-with-scrapy-and-mongodb/books/books/spiders/book.py index 2c0054dc6d..a184375532 100644 --- a/web-scraping-with-scrapy-and-mongodb/books/books/spiders/book.py +++ b/web-scraping-with-scrapy-and-mongodb/books/books/spiders/book.py @@ -1,6 +1,5 @@ import scrapy - -from books.items import BookItem +from books.items import BooksItem class BookSpider(scrapy.Spider): @@ -22,7 +21,7 @@ def parse(self, response): @scrapes url title price """ for book in response.css("article.product_pod"): - item = 
BookItem() + item = BooksItem() item["url"] = book.css("h3 > a::attr(href)").get() item["title"] = book.css("h3 > a::attr(title)").get() item["price"] = book.css(".price_color::text").get() diff --git a/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py b/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py index 30f3edc4f8..c1d308f635 100644 --- a/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py +++ b/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py @@ -1,7 +1,7 @@ import unittest from pathlib import Path -from books.items import BookItem +from books.items import BooksItem from books.spiders.book import BookSpider from scrapy.http import HtmlResponse, Request @@ -29,7 +29,7 @@ def test_parse_scrapes_all_items(self): results = list(self.spider.parse(self.response)) # There should be two book items and one pagination request - book_items = [item for item in results if isinstance(item, BookItem)] + book_items = [item for item in results if isinstance(item, BooksItem)] pagination_requests = [ item for item in results if isinstance(item, Request) ] From 1dcdaf93a13c4c7ffa606688e69bd9dbf3a7220c Mon Sep 17 00:00:00 2001 From: martin-martin Date: Fri, 26 Jul 2024 13:37:32 +0200 Subject: [PATCH 06/12] Fix ruff error --- web-scraping-with-scrapy-and-mongodb/books/books/spiders/book.py | 1 + 1 file changed, 1 insertion(+) diff --git a/web-scraping-with-scrapy-and-mongodb/books/books/spiders/book.py b/web-scraping-with-scrapy-and-mongodb/books/books/spiders/book.py index a184375532..4d670eb071 100644 --- a/web-scraping-with-scrapy-and-mongodb/books/books/spiders/book.py +++ b/web-scraping-with-scrapy-and-mongodb/books/books/spiders/book.py @@ -1,4 +1,5 @@ import scrapy + from books.items import BooksItem From 67c02e041cfa902c1bf8b66af222ac0d907e9826 Mon Sep 17 00:00:00 2001 From: Martin Breuss Date: Fri, 2 Aug 2024 18:47:52 +0200 Subject: [PATCH 07/12] Apply feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bartosz Zaczyński --- web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py b/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py index c1d308f635..db08bf9967 100644 --- a/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py +++ b/web-scraping-with-scrapy-and-mongodb/books/tests/test_book.py @@ -8,8 +8,7 @@ def _get_sample_html_content(): html_file_path = Path(__file__).parent / "sample.html" - with html_file_path.open() as html_file: - return html_file.read() + return html_file_path.read_text("utf-8") class BookSpiderTest(unittest.TestCase): From c7904d447d3cff97018ec34f54206b4cbe22fc1c Mon Sep 17 00:00:00 2001 From: Leodanis Pozo Ramos Date: Wed, 14 Aug 2024 17:37:59 +0200 Subject: [PATCH 08/12] Sample code for the article on lists vs tuples --- python-lists-tuples/README.md | 3 +++ python-lists-tuples/create-lists-tuples.py | 17 +++++++++++++++++ python-lists-tuples/functions.py | 5 +++++ python-lists-tuples/list-methods.py | 19 +++++++++++++++++++ python-lists-tuples/nested-lists.py | 11 +++++++++++ python-lists-tuples/operators.py | 7 +++++++ python-lists-tuples/remove-items.py | 7 +++++++ python-lists-tuples/slicing.py | 7 +++++++ python-lists-tuples/unpacking.py | 20 ++++++++++++++++++++ 9 files changed, 96 insertions(+) create mode 100644 python-lists-tuples/README.md create mode 100644 
python-lists-tuples/create-lists-tuples.py create mode 100644 python-lists-tuples/functions.py create mode 100644 python-lists-tuples/list-methods.py create mode 100644 python-lists-tuples/nested-lists.py create mode 100644 python-lists-tuples/operators.py create mode 100644 python-lists-tuples/remove-items.py create mode 100644 python-lists-tuples/slicing.py create mode 100644 python-lists-tuples/unpacking.py diff --git a/python-lists-tuples/README.md b/python-lists-tuples/README.md new file mode 100644 index 0000000000..a46ec61c53 --- /dev/null +++ b/python-lists-tuples/README.md @@ -0,0 +1,3 @@ +# Lists vs Tuples in Python + +This folder provides the code examples for the Real Python tutorial [Lists vs Tuples in Python](https://realpython.com/python-lists-tuples/). diff --git a/python-lists-tuples/create-lists-tuples.py b/python-lists-tuples/create-lists-tuples.py new file mode 100644 index 0000000000..6446e447ae --- /dev/null +++ b/python-lists-tuples/create-lists-tuples.py @@ -0,0 +1,17 @@ +colors = ["red", "green", "blue", "yellow"] +print(colors) + +person = ("Jane Doe", 25, "Python Developer", "Canada") +print(person) + +digits = list(range(10)) +print(digits) + +even_digits = [number for number in range(1, 10) if not number % 2] +print(even_digits) + +print(["Pythonista", 7, False, 3.14159]) +print(("Pythonista", 7, False, 3.14159)) + +print(list(range(1_000_000))) +print(tuple(range(1_000_000))) diff --git a/python-lists-tuples/functions.py b/python-lists-tuples/functions.py new file mode 100644 index 0000000000..b2798c3af0 --- /dev/null +++ b/python-lists-tuples/functions.py @@ -0,0 +1,5 @@ +numbers = [2, 7, 5, 4, 8] +print(len(numbers)) +print(min(numbers)) +print(max(numbers)) +print(sum(numbers)) diff --git a/python-lists-tuples/list-methods.py b/python-lists-tuples/list-methods.py new file mode 100644 index 0000000000..c6fd8618d2 --- /dev/null +++ b/python-lists-tuples/list-methods.py @@ -0,0 +1,19 @@ +a = ["a", "b"] +a.append("c") +print(a) + +a = ["a", "c"] +a.insert(1, "b") +print(a) + +a = ["a", "b", "c", "d", "e"] +a.remove("b") +print(a) +a.remove("c") +print(a) + +a = ["a", "b", "c", "d", "e"] +a.pop() +print(a) +a.pop() +print(a) diff --git a/python-lists-tuples/nested-lists.py b/python-lists-tuples/nested-lists.py new file mode 100644 index 0000000000..0098dc7977 --- /dev/null +++ b/python-lists-tuples/nested-lists.py @@ -0,0 +1,11 @@ +x = ["a", ["bb", ["ccc", "ddd"], "ee", "ff"], "g", ["hh", "ii"], "j"] + +print(x[0], x[2], x[4]) + +print(x[1]) +print(x[3]) + +print(x[1][0]) +print(x[1][1]) +print(x[1][2]) +print(x[1][3]) diff --git a/python-lists-tuples/operators.py b/python-lists-tuples/operators.py new file mode 100644 index 0000000000..decf26f797 --- /dev/null +++ b/python-lists-tuples/operators.py @@ -0,0 +1,7 @@ +words = ["foo", "bar", "baz", "qux", "quux", "corge"] +print("qux" in words) +print("py" in words) +print("thud" not in words) + +print(words + ["grault", "garply"]) +print(words * 2) diff --git a/python-lists-tuples/remove-items.py b/python-lists-tuples/remove-items.py new file mode 100644 index 0000000000..087c1c8bde --- /dev/null +++ b/python-lists-tuples/remove-items.py @@ -0,0 +1,7 @@ +fruits = ["apple", "orange", "mango", "grape"] +del fruits[0] # Remove apple + +print(fruits) + +person = ("John Doe", 35, "Web Dev") +# del person[1] # Try to remove the age value diff --git a/python-lists-tuples/slicing.py b/python-lists-tuples/slicing.py new file mode 100644 index 0000000000..2997ea7ad2 --- /dev/null +++ b/python-lists-tuples/slicing.py @@ -0,0 
+1,7 @@ +numbers = [1, 2, 3, 7] +numbers[3:4] = [4, 5, 6, 7] +print(numbers) + +numbers = [1, 2, 3, 7] +numbers[3:3] = [4, 5, 6] +print(numbers) diff --git a/python-lists-tuples/unpacking.py b/python-lists-tuples/unpacking.py new file mode 100644 index 0000000000..5cdcc2d09f --- /dev/null +++ b/python-lists-tuples/unpacking.py @@ -0,0 +1,20 @@ +t = ("foo", "bar", "baz", "qux") + +s1, s2, s3, s4 = t +print(s1) +print(s2) +print(s3) +print(s4) + +a = "foo" +b = "bar" +# Using a temporary variable +temp = a +a = b +b = temp +(a, b) +a = "foo" +b = "bar" +# Using unpacking +a, b = b, a +a, b From fee31152737edaf0a52bdb3964e08c911d09c566 Mon Sep 17 00:00:00 2001 From: Leodanis Pozo Ramos Date: Thu, 15 Aug 2024 14:41:25 +0200 Subject: [PATCH 09/12] TR updates, first round --- .../{create-lists-tuples.py => create_lists_tuples.py} | 0 python-lists-tuples/{list-methods.py => list_methods.py} | 0 python-lists-tuples/{nested-lists.py => nested_lists.py} | 0 python-lists-tuples/{remove-items.py => remove_items.py} | 0 python-lists-tuples/unpacking.py | 5 +++-- 5 files changed, 3 insertions(+), 2 deletions(-) rename python-lists-tuples/{create-lists-tuples.py => create_lists_tuples.py} (100%) rename python-lists-tuples/{list-methods.py => list_methods.py} (100%) rename python-lists-tuples/{nested-lists.py => nested_lists.py} (100%) rename python-lists-tuples/{remove-items.py => remove_items.py} (100%) diff --git a/python-lists-tuples/create-lists-tuples.py b/python-lists-tuples/create_lists_tuples.py similarity index 100% rename from python-lists-tuples/create-lists-tuples.py rename to python-lists-tuples/create_lists_tuples.py diff --git a/python-lists-tuples/list-methods.py b/python-lists-tuples/list_methods.py similarity index 100% rename from python-lists-tuples/list-methods.py rename to python-lists-tuples/list_methods.py diff --git a/python-lists-tuples/nested-lists.py b/python-lists-tuples/nested_lists.py similarity index 100% rename from python-lists-tuples/nested-lists.py rename to python-lists-tuples/nested_lists.py diff --git a/python-lists-tuples/remove-items.py b/python-lists-tuples/remove_items.py similarity index 100% rename from python-lists-tuples/remove-items.py rename to python-lists-tuples/remove_items.py diff --git a/python-lists-tuples/unpacking.py b/python-lists-tuples/unpacking.py index 5cdcc2d09f..2f4681a7ff 100644 --- a/python-lists-tuples/unpacking.py +++ b/python-lists-tuples/unpacking.py @@ -12,9 +12,10 @@ temp = a a = b b = temp -(a, b) +print(a, b) + a = "foo" b = "bar" # Using unpacking a, b = b, a -a, b +print(a, b) From d705ca47939020d0bbfb040221bb591c81f35d76 Mon Sep 17 00:00:00 2001 From: martin-martin Date: Fri, 16 Aug 2024 14:12:24 +0200 Subject: [PATCH 10/12] Update code for new API and model versions --- openai-dalle/README.md | 9 +++++---- openai-dalle/create.py | 14 +++++++------- openai-dalle/create_dalle3.py | 14 ++++++++++++++ openai-dalle/edit.py | 13 ++++++------- openai-dalle/requirements.txt | 31 +++++++++++++++---------------- openai-dalle/vary.py | 13 ++++++------- 6 files changed, 53 insertions(+), 41 deletions(-) create mode 100644 openai-dalle/create_dalle3.py diff --git a/openai-dalle/README.md b/openai-dalle/README.md index 0f27dc881f..2e92561c3a 100644 --- a/openai-dalle/README.md +++ b/openai-dalle/README.md @@ -1,6 +1,6 @@ # Generate Images With DALL·E 2 and the OpenAI API -Learn to use the OpenAI Python library to create images with DALL·E, a state-of-the-art latent diffusion model. 
In the associated tutorial on [generating images with DALL·E 2 and the OpenAI API](https://realpython.com/generate-images-with-dalle-openai-api/), you explore image creation and generating image variations. You learn how to interact with DALL·E using API calls and incorporate this functionality into your Python scripts. +Learn to use the OpenAI Python library to create images with DALL·E, a state-of-the-art latent diffusion model. In the associated tutorial on [generating images with DALL·E and the OpenAI API](https://realpython.com/generate-images-with-dalle-openai-api/), you explore image creation and generating image variations. You learn how to interact with DALL·E using API calls and incorporate this functionality into your Python scripts. ## Setup @@ -8,7 +8,7 @@ Create and activate a virtual environment, then install the `openai` package: ```console $ python --version -Python 3.11.0 +Python 3.12.5 $ python -m venv venv $ source venv/bin/activate (venv) $ python -m pip install openai @@ -22,7 +22,8 @@ Follow the instructions in [the tutorial](https://realpython.com/generate-images You can find the code for each of these steps in dedicated scripts: -- `create.py`: Create an image from a text prompt and save the image data to a file. +- `create_dalle3.py`: Create an image from a text prompt using DALL·E 3 and save the image data to a file. +- `create.py`: Create an image from a text prompt using DALL·E 2 and save the image data to a file. - `convert.py`: Convert a Base64-encoded PNG image delivered in a JSON response to a PNG image file. - `vary.py`: Read Base64-encoded image data and make an API request to receive variations of that image. @@ -30,6 +31,6 @@ In the tutorial, you'll walk through each of these scripts and their functionali ## Edit Images (Inpainting and Outpainting) -The OpenAI Image API also allows you to [edit parts of an image](https://beta.openai.com/docs/guides/images/edits) using text prompts. For this, you need to create a mask with transparent image data in the area where you want to edit the image. +The OpenAI Image API also allows you to [edit parts of an image](https://platform.openai.com/docs/guides/images/edits-dall-e-2-only) using text prompts. For this, you need to create a mask with transparent image data in the area where you want to edit the image. You can run `edit.py` to give this functionality a try. 
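If you want to build such a mask programmatically, one option is Pillow. This is only a sketch under stated assumptions: Pillow is not part of this project's `requirements.txt`, and the box coordinates below are arbitrary placeholders that you'd adjust to the area you want repainted. It writes the mask next to the source image under the file names that `edit.py` expects:

```python
from PIL import Image  # assumes you've installed Pillow separately

# Open the source image that edit.py uses and keep an alpha channel.
source = Image.open("images/An ec-1667994848/computer.png").convert("RGBA")

# Make the region to be edited fully transparent; DALL·E repaints only
# the transparent pixels. Adjust the box to fit your own image.
for x in range(80, 180):
    for y in range(80, 180):
        source.putpixel((x, y), (0, 0, 0, 0))

source.save("images/An ec-1667994848/mask.png")
```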
diff --git a/openai-dalle/create.py b/openai-dalle/create.py index 7836d40ef7..7ad30ff80e 100644 --- a/openai-dalle/create.py +++ b/openai-dalle/create.py @@ -1,24 +1,24 @@ import json -import os from pathlib import Path -import openai +from openai import OpenAI + +client = OpenAI() PROMPT = "An eco-friendly computer from the 90s in the style of vaporwave" DATA_DIR = Path.cwd() / "responses" DATA_DIR.mkdir(exist_ok=True) -openai.api_key = os.getenv("OPENAI_API_KEY") - -response = openai.Image.create( +response = client.images.generate( + model="dall-e-2", prompt=PROMPT, n=1, size="256x256", response_format="b64_json", ) -file_name = DATA_DIR / f"{PROMPT[:5]}-{response['created']}.json" +file_name = DATA_DIR / f"{PROMPT[:5]}-{response.created}.json" with open(file_name, mode="w", encoding="utf-8") as file: - json.dump(response, file) + json.dump(response.to_dict(), file) diff --git a/openai-dalle/create_dalle3.py b/openai-dalle/create_dalle3.py new file mode 100644 index 0000000000..d21b3997da --- /dev/null +++ b/openai-dalle/create_dalle3.py @@ -0,0 +1,14 @@ +from openai import OpenAI + +client = OpenAI() + +PROMPT = "A vaporwave computer" + + +response = client.images.generate( + model="dall-e-3", + prompt=PROMPT, +) + +print(response.data[0].url) +print(response.data[0].revised_prompt) diff --git a/openai-dalle/edit.py b/openai-dalle/edit.py index 96473453a1..ff1cfe1c4f 100644 --- a/openai-dalle/edit.py +++ b/openai-dalle/edit.py @@ -1,8 +1,9 @@ import json -import os from pathlib import Path -import openai +from openai import OpenAI + +client = OpenAI() SOURCE_PATH = Path.cwd() / "images" / "An ec-1667994848" DESTINATION_PATH = Path.cwd() / "responses" @@ -11,9 +12,7 @@ SOURCE_PATH.mkdir(parents=True, exist_ok=True) DESTINATION_PATH.mkdir(parents=True, exist_ok=True) -openai.api_key = os.getenv("OPENAI_API_KEY") - -response = openai.Image.create_edit( +response = client.images.edit( image=open(SOURCE_PATH / "computer.png", mode="rb"), mask=open(SOURCE_PATH / "mask.png", mode="rb"), prompt=PROMPT, @@ -23,8 +22,8 @@ ) with open( - DESTINATION_PATH / f"edit-{PROMPT[:5]}-{response['created']}.json", + DESTINATION_PATH / f"edit-{PROMPT[:5]}-{response.created}.json", mode="w", encoding="utf-8", ) as file: - json.dump(response, file) + json.dump(response.to_dict(), file) diff --git a/openai-dalle/requirements.txt b/openai-dalle/requirements.txt index 85fd283124..ab13983f8e 100644 --- a/openai-dalle/requirements.txt +++ b/openai-dalle/requirements.txt @@ -1,16 +1,15 @@ -certifi==2022.9.24 -charset-normalizer==2.1.1 -et-xmlfile==1.1.0 -idna==3.4 -numpy==1.23.4 -openai==0.25.0 -openpyxl==3.0.10 -pandas==1.5.1 -pandas-stubs==1.2.0.62 -python-dateutil==2.8.2 -pytz==2022.6 -requests==2.28.1 -six==1.16.0 -tqdm==4.64.1 -typing_extensions==4.4.0 -urllib3==1.26.12 +annotated-types==0.7.0 +anyio==4.4.0 +certifi==2024.7.4 +distro==1.9.0 +h11==0.14.0 +httpcore==1.0.5 +httpx==0.27.0 +idna==3.7 +jiter==0.5.0 +openai==1.40.6 +pydantic==2.8.2 +pydantic_core==2.20.1 +sniffio==1.3.1 +tqdm==4.66.5 +typing_extensions==4.12.2 diff --git a/openai-dalle/vary.py b/openai-dalle/vary.py index c56777efca..ce27e64ca0 100644 --- a/openai-dalle/vary.py +++ b/openai-dalle/vary.py @@ -1,27 +1,26 @@ import json -import os from base64 import b64decode from pathlib import Path -import openai +from openai import OpenAI + +client = OpenAI() DATA_DIR = Path.cwd() / "responses" SOURCE_FILE = DATA_DIR / "An ec-1667994848.json" -openai.api_key = os.getenv("OPENAI_API_KEY") - with open(SOURCE_FILE, mode="r", encoding="utf-8") as 
json_file: saved_response = json.load(json_file) image_data = b64decode(saved_response["data"][0]["b64_json"]) -response = openai.Image.create_variation( +response = client.images.create_variation( image=image_data, n=3, size="256x256", response_format="b64_json", ) -new_file_name = f"vary-{SOURCE_FILE.stem[:5]}-{response['created']}.json" +new_file_name = f"vary-{SOURCE_FILE.stem[:5]}-{response.created}.json" with open(DATA_DIR / new_file_name, mode="w", encoding="utf-8") as file: - json.dump(response, file) + json.dump(response.to_dict(), file) From 126460f42af5082ce7f72c796b3e08e6064cbf8f Mon Sep 17 00:00:00 2001 From: brendaweles <160772586+brendaweles@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:26:36 -0600 Subject: [PATCH 11/12] Language Edit --- openai-dalle/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openai-dalle/README.md b/openai-dalle/README.md index 2e92561c3a..465eae4891 100644 --- a/openai-dalle/README.md +++ b/openai-dalle/README.md @@ -1,6 +1,6 @@ -# Generate Images With DALL·E 2 and the OpenAI API +# Generate Images With DALL·E and the OpenAI API -Learn to use the OpenAI Python library to create images with DALL·E, a state-of-the-art latent diffusion model. In the associated tutorial on [generating images with DALL·E and the OpenAI API](https://realpython.com/generate-images-with-dalle-openai-api/), you explore image creation and generating image variations. You learn how to interact with DALL·E using API calls and incorporate this functionality into your Python scripts. +Learn to use the OpenAI Python library to create images with DALL·E, a state-of-the-art latent diffusion model. In the associated tutorial on [generating images with DALL·E and the OpenAI API](https://realpython.com/generate-images-with-dalle-openai-api/), you'll explore image creation and generating image variations. You'll learn how to interact with DALL·E using API calls and incorporate this functionality into your Python scripts. ## Setup From f5140c8d5cfe7aa37ad35e3fe52234990b0113c4 Mon Sep 17 00:00:00 2001 From: Geir Arne Hjelle Date: Thu, 22 Aug 2024 11:48:20 +0200 Subject: [PATCH 12/12] Final QA (#571) --- openai-dalle/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openai-dalle/README.md b/openai-dalle/README.md index 465eae4891..7acc27bef7 100644 --- a/openai-dalle/README.md +++ b/openai-dalle/README.md @@ -22,7 +22,7 @@ Follow the instructions in [the tutorial](https://realpython.com/generate-images You can find the code for each of these steps in dedicated scripts: -- `create_dalle3.py`: Create an image from a text prompt using DALL·E 3 and save the image data to a file. +- `create_dalle3.py`: Create an image from a text prompt using DALL·E 3 and display the URL to the image. - `create.py`: Create an image from a text prompt using DALL·E 2 and save the image data to a file. - `convert.py`: Convert a Base64-encoded PNG image delivered in a JSON response to a PNG image file. - `vary.py`: Read Base64-encoded image data and make an API request to receive variations of that image.
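Since `create_dalle3.py` only prints a URL, you may want to save the generated image locally. Here's a small follow-up sketch using `httpx` (already pinned in `openai-dalle/requirements.txt`); the output file name is an arbitrary choice, and the URL is the one printed by the script:

```python
import httpx

# Paste the URL printed by create_dalle3.py; the hosted image is temporary,
# so download it soon after generating.
image_url = "https://..."

response = httpx.get(image_url)
response.raise_for_status()

with open("dalle3_image.png", mode="wb") as image_file:
    image_file.write(response.content)
```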