From bc1c95af6a579b72d152d6bf33ae432741362f26 Mon Sep 17 00:00:00 2001 From: Florent Ravenel Date: Mon, 9 Oct 2023 09:18:50 +0200 Subject: [PATCH 1/3] feat(Python): Add Download Webpage as PDF --- Python/Python_Download_Webpage_as_PDF.ipynb | 263 ++++++++++++++++++++ 1 file changed, 263 insertions(+) create mode 100644 Python/Python_Download_Webpage_as_PDF.ipynb diff --git a/Python/Python_Download_Webpage_as_PDF.ipynb b/Python/Python_Download_Webpage_as_PDF.ipynb new file mode 100644 index 0000000000..311289e137 --- /dev/null +++ b/Python/Python_Download_Webpage_as_PDF.ipynb @@ -0,0 +1,263 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f10d5e39-7e5d-476f-8e2b-3a14ec878773", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "\"Naas\"" + ] + }, + { + "cell_type": "markdown", + "id": "699b8293-3e19-4e8c-9732-668e002efe19", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "# Python - Download Webpage as PDF" + ] + }, + { + "cell_type": "markdown", + "id": "acc3b5ee-6154-4199-9534-da3504743cc3", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**Tags:** #python #webpage #download #pdf #library #urllib" + ] + }, + { + "cell_type": "markdown", + "id": "da2f66ee-bab4-483a-bc79-cb42ace8d2a1", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**Author:** [Firstname Lastname]()" + ] + }, + { + "cell_type": "markdown", + "id": "52d2a5d6-53c1-4da3-ba02-b2cc2c491f97", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**Last update:** 2023-10-09 (Created: 2023-10-09)" + ] + }, + { + "cell_type": "markdown", + "id": "a3679e7a-44b8-48cc-92f6-faaa9ce4e088", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**Description:** This notebook downloads a webpage as a PDF file using the Python library urllib. It is usefull for organizations that need to save webpages for archiving or for further analysis." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f0de625f-9545-44bb-9ef6-4819c2d6bc50", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**References:**\n- [urllib Documentation](https://docs.python.org/3/library/urllib.html)\n- [urllib Tutorial](https://www.tutorialspoint.com/python/python_urllib.htm)" + ] + }, + { + "cell_type": "markdown", + "id": "309e478b-03e6-4c0e-b0ac-0ce68f3eef2c", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "## Input" + ] + }, + { + "cell_type": "markdown", + "id": "8283e962-8c3a-40d8-b3a3-9a2f043436aa", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf4d6821-ca76-4cad-92d9-2535f4c877a4", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": "import urllib.request", + "outputs": [] + }, + { + "cell_type": "markdown", + "id": "699e12f4-262d-45bc-a935-9f27070c02e9", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "### Setup variables\n- `url`: URL of the webpage to be downloaded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "737c5d30-d65e-4e86-86aa-dc832c69fe44", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": "url = \"https://www.example.com\"", + "outputs": [] + }, + { + "cell_type": "markdown", + "id": "fe70d837-f74e-49d0-b1c3-1821347fb592", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "## Model" + ] + }, + { + "cell_type": "markdown", + "id": "31968d51-d39e-4418-a7c7-9273949ff799", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "### Download webpage as PDF" + ] + }, + { + "cell_type": "markdown", + "id": "b0617cd8-a7ec-4101-9b3f-f70ac1543621", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "Download the webpage as a PDF file using the `urlretrieve` function from the `urllib` library." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27ee53ca-333a-4e7f-a6d3-20b25227ecd1", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": "urllib.request.urlretrieve(url, \"webpage.pdf\")", + "outputs": [] + }, + { + "cell_type": "markdown", + "id": "aefdb14c-f36f-4417-aa99-1b37e26be06b", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "## Output" + ] + }, + { + "cell_type": "markdown", + "id": "f7397b8d-6152-4d35-8f2f-01995e5a8858", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "### Display result" + ] + }, + { + "cell_type": "markdown", + "id": "b3cfe3d5-119b-43a3-ba59-84931425c1ff", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "The webpage has been downloaded as a PDF file." + ] + }, + { + "cell_type": "markdown", + "id": "be9c4219-79a5-4668-97f4-38ea61ffbbc2", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + " " + ] + }, + { + "cell_type": "markdown", + "id": "b91cb912-ff76-428c-8e57-56341e93181f", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file From f7163f95e9f3ed5ad35ab4c0fb4d53bdcf357a1a Mon Sep 17 00:00:00 2001 From: mnsvag Date: Mon, 16 Oct 2023 08:02:30 +0200 Subject: [PATCH 2/3] feat: add code to download web page as pdf --- Python/Python_Download_Webpage_as_PDF.ipynb | 110 +++++++++++++++++--- 1 file changed, 94 insertions(+), 16 
deletions(-) diff --git a/Python/Python_Download_Webpage_as_PDF.ipynb b/Python/Python_Download_Webpage_as_PDF.ipynb index 311289e137..f9c0259ebb 100644 --- a/Python/Python_Download_Webpage_as_PDF.ipynb +++ b/Python/Python_Download_Webpage_as_PDF.ipynb @@ -41,7 +41,7 @@ "tags": [] }, "source": [ - "**Author:** [Firstname Lastname]()" + "**Author:** [Manasvi Agarwal]()" ] }, { @@ -52,7 +52,7 @@ "tags": [] }, "source": [ - "**Last update:** 2023-10-09 (Created: 2023-10-09)" + "**Last update:** 2023-10-16 (Created: 2023-10-09)" ] }, { @@ -74,7 +74,8 @@ "tags": [] }, "source": [ - "**References:**\n- [urllib Documentation](https://docs.python.org/3/library/urllib.html)\n- [urllib Tutorial](https://www.tutorialspoint.com/python/python_urllib.htm)" + "**References:**\n", + "- [weasyprint Documentation](https://doc.courtbouillon.org/weasyprint/stable/)" ] }, { @@ -101,14 +102,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "bf4d6821-ca76-4cad-92d9-2535f4c877a4", "metadata": { + "execution": { + "iopub.execute_input": "2023-10-16T05:53:23.895258Z", + "iopub.status.busy": "2023-10-16T05:53:23.895027Z", + "iopub.status.idle": "2023-10-16T05:53:23.898308Z", + "shell.execute_reply": "2023-10-16T05:53:23.897653Z", + "shell.execute_reply.started": "2023-10-16T05:53:23.895236Z" + }, "papermill": {}, "tags": [] }, - "source": "import urllib.request", - "outputs": [] + "outputs": [], + "source": [ + "from weasyprint import HTML" + ] }, { "cell_type": "markdown", @@ -118,19 +128,48 @@ "tags": [] }, "source": [ - "### Setup variables\n- `url`: URL of the webpage to be downloaded" + "### Setup variables\n", + "- `url`: URL of the webpage to be downloaded" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "737c5d30-d65e-4e86-86aa-dc832c69fe44", "metadata": { + "execution": { + "iopub.execute_input": "2023-10-16T05:53:25.163098Z", + "iopub.status.busy": "2023-10-16T05:53:25.162871Z", + "iopub.status.idle": 
"2023-10-16T05:53:25.165963Z", + "shell.execute_reply": "2023-10-16T05:53:25.165290Z", + "shell.execute_reply.started": "2023-10-16T05:53:25.163075Z" + }, "papermill": {}, "tags": [] }, - "source": "url = \"https://www.example.com\"", - "outputs": [] + "outputs": [], + "source": [ + "url = \"https://www.google.com\"" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "804a41a0-5bdf-4f79-bceb-eebf47a6554a", + "metadata": { + "execution": { + "iopub.execute_input": "2023-10-16T05:53:25.721944Z", + "iopub.status.busy": "2023-10-16T05:53:25.721684Z", + "iopub.status.idle": "2023-10-16T05:53:25.724770Z", + "shell.execute_reply": "2023-10-16T05:53:25.724130Z", + "shell.execute_reply.started": "2023-10-16T05:53:25.721907Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "output_pdf_path = 'webpage.pdf'" + ] }, { "cell_type": "markdown", @@ -167,14 +206,26 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "27ee53ca-333a-4e7f-a6d3-20b25227ecd1", + "execution_count": 13, + "id": "a3b86f57-f170-4832-ad36-58cd1d0e25e6", "metadata": { - "papermill": {}, + "execution": { + "iopub.execute_input": "2023-10-16T05:53:28.351404Z", + "iopub.status.busy": "2023-10-16T05:53:28.351171Z", + "iopub.status.idle": "2023-10-16T05:53:28.711058Z", + "shell.execute_reply": "2023-10-16T05:53:28.710476Z", + "shell.execute_reply.started": "2023-10-16T05:53:28.351380Z" + }, "tags": [] }, - "source": "urllib.request.urlretrieve(url, \"webpage.pdf\")", - "outputs": [] + "outputs": [], + "source": [ + "try:\n", + " pdf = HTML(url).write_pdf()\n", + " open(output_pdf_path, 'wb').write(pdf)\n", + "except Exception as e:\n", + " print(f\"An error occurred: {e}\")" + ] }, { "cell_type": "markdown", @@ -198,6 +249,33 @@ "### Display result" ] }, + { + "cell_type": "code", + "execution_count": 14, + "id": "0de6ab5b-99c8-41e2-b709-af44b3d2b385", + "metadata": { + "execution": { + "iopub.execute_input": "2023-10-16T05:53:30.200354Z", + "iopub.status.busy": 
"2023-10-16T05:53:30.200127Z", + "iopub.status.idle": "2023-10-16T05:53:30.205621Z", + "shell.execute_reply": "2023-10-16T05:53:30.204962Z", + "shell.execute_reply.started": "2023-10-16T05:53:30.200331Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Webpage downloaded as PDF to webpage.pdf\n" + ] + } + ], + "source": [ + "print(f\"Webpage downloaded as PDF to {output_pdf_path}\")" + ] + }, { "cell_type": "markdown", "id": "b3cfe3d5-119b-43a3-ba59-84931425c1ff", @@ -260,4 +338,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} From f0bb334b2dcf395999b4fd62abf03a81c9937422 Mon Sep 17 00:00:00 2001 From: mnsvag Date: Mon, 16 Oct 2023 08:15:52 +0200 Subject: [PATCH 3/3] fix: documentation, variable names --- Python/Python_Download_Webpage_as_PDF.ipynb | 94 ++++++++------------- 1 file changed, 33 insertions(+), 61 deletions(-) diff --git a/Python/Python_Download_Webpage_as_PDF.ipynb b/Python/Python_Download_Webpage_as_PDF.ipynb index f9c0259ebb..729af19ced 100644 --- a/Python/Python_Download_Webpage_as_PDF.ipynb +++ b/Python/Python_Download_Webpage_as_PDF.ipynb @@ -41,7 +41,7 @@ "tags": [] }, "source": [ - "**Author:** [Manasvi Agarwal]()" + "**Author:** [Manasvi Agarwal](https://www.linkedin.com/in/mnsv/)" ] }, { @@ -63,7 +63,7 @@ "tags": [] }, "source": [ - "**Description:** This notebook downloads a webpage as a PDF file using the Python library urllib. It is usefull for organizations that need to save webpages for archiving or for further analysis." + "**Description:** This notebook downloads a webpage as a PDF file using the Python library weasyprint. It is useful for organizations that need to save webpages for archiving or for further analysis." 
] }, { @@ -102,15 +102,15 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 16, "id": "bf4d6821-ca76-4cad-92d9-2535f4c877a4", "metadata": { "execution": { - "iopub.execute_input": "2023-10-16T05:53:23.895258Z", - "iopub.status.busy": "2023-10-16T05:53:23.895027Z", - "iopub.status.idle": "2023-10-16T05:53:23.898308Z", - "shell.execute_reply": "2023-10-16T05:53:23.897653Z", - "shell.execute_reply.started": "2023-10-16T05:53:23.895236Z" + "iopub.execute_input": "2023-10-16T06:07:44.816453Z", + "iopub.status.busy": "2023-10-16T06:07:44.816157Z", + "iopub.status.idle": "2023-10-16T06:07:44.825636Z", + "shell.execute_reply": "2023-10-16T06:07:44.824969Z", + "shell.execute_reply.started": "2023-10-16T06:07:44.816420Z" }, "papermill": {}, "tags": [] @@ -129,46 +129,29 @@ }, "source": [ "### Setup variables\n", - "- `url`: URL of the webpage to be downloaded" + "- `url`: URL of the webpage to be downloaded\n", + "- `webpage_path`: Path where the output PDF file will be saved" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 17, "id": "737c5d30-d65e-4e86-86aa-dc832c69fe44", "metadata": { "execution": { - "iopub.execute_input": "2023-10-16T05:53:25.163098Z", - "iopub.status.busy": "2023-10-16T05:53:25.162871Z", - "iopub.status.idle": "2023-10-16T05:53:25.165963Z", - "shell.execute_reply": "2023-10-16T05:53:25.165290Z", - "shell.execute_reply.started": "2023-10-16T05:53:25.163075Z" + "iopub.execute_input": "2023-10-16T06:07:46.582693Z", + "iopub.status.busy": "2023-10-16T06:07:46.582387Z", + "iopub.status.idle": "2023-10-16T06:07:46.585926Z", + "shell.execute_reply": "2023-10-16T06:07:46.585292Z", + "shell.execute_reply.started": "2023-10-16T06:07:46.582656Z" }, "papermill": {}, "tags": [] }, "outputs": [], "source": [ - "url = \"https://www.google.com\"" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "804a41a0-5bdf-4f79-bceb-eebf47a6554a", - "metadata": { - "execution": { - "iopub.execute_input": 
"2023-10-16T05:53:25.721944Z", - "iopub.status.busy": "2023-10-16T05:53:25.721684Z", - "iopub.status.idle": "2023-10-16T05:53:25.724770Z", - "shell.execute_reply": "2023-10-16T05:53:25.724130Z", - "shell.execute_reply.started": "2023-10-16T05:53:25.721907Z" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "output_pdf_path = 'webpage.pdf'" + "url = \"https://www.example.com\"\n", + "webpage_path = 'webpage.pdf'" ] }, { @@ -201,20 +184,20 @@ "tags": [] }, "source": [ - "Download the webpage as a PDF file using the `urlretrieve` function from the `urllib` library." + "Download the webpage as a PDF file using the `write_pdf` method of the `HTML` class from the `weasyprint` library." ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 18, "id": "a3b86f57-f170-4832-ad36-58cd1d0e25e6", "metadata": { "execution": { - "iopub.execute_input": "2023-10-16T05:53:28.351404Z", - "iopub.status.busy": "2023-10-16T05:53:28.351171Z", - "iopub.status.idle": "2023-10-16T05:53:28.711058Z", - "shell.execute_reply": "2023-10-16T05:53:28.710476Z", - "shell.execute_reply.started": "2023-10-16T05:53:28.351380Z" + "iopub.execute_input": "2023-10-16T06:07:49.397187Z", + "iopub.status.busy": "2023-10-16T06:07:49.396824Z", + "iopub.status.idle": "2023-10-16T06:07:49.973517Z", + "shell.execute_reply": "2023-10-16T06:07:49.972747Z", + "shell.execute_reply.started": "2023-10-16T06:07:49.397149Z" }, "tags": [] }, @@ -222,7 +205,7 @@ "source": [ "try:\n", " pdf = HTML(url).write_pdf()\n", - " open(output_pdf_path, 'wb').write(pdf)\n", + " open(webpage_path, 'wb').write(pdf)\n", "except Exception as e:\n", " print(f\"An error occurred: {e}\")" ] @@ -251,15 +234,15 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 19, "id": "0de6ab5b-99c8-41e2-b709-af44b3d2b385", "metadata": { "execution": { - "iopub.execute_input": "2023-10-16T05:53:30.200354Z", - "iopub.status.busy": "2023-10-16T05:53:30.200127Z", - "iopub.status.idle": "2023-10-16T05:53:30.205621Z", -
"shell.execute_reply": "2023-10-16T05:53:30.204962Z", - "shell.execute_reply.started": "2023-10-16T05:53:30.200331Z" + "iopub.execute_input": "2023-10-16T06:07:51.806281Z", + "iopub.status.busy": "2023-10-16T06:07:51.805973Z", + "iopub.status.idle": "2023-10-16T06:07:51.814963Z", + "shell.execute_reply": "2023-10-16T06:07:51.814305Z", + "shell.execute_reply.started": "2023-10-16T06:07:51.806246Z" }, "tags": [] }, @@ -273,18 +256,7 @@ } ], "source": [ - "print(f\"Webpage downloaded as PDF to {output_pdf_path}\")" - ] - }, - { - "cell_type": "markdown", - "id": "b3cfe3d5-119b-43a3-ba59-84931425c1ff", - "metadata": { - "papermill": {}, - "tags": [] - }, - "source": [ - "The webpage has been downloaded as a PDF file." + "print(f\"Webpage downloaded as PDF to {webpage_path}\")" ] }, {