diff --git a/Python/Python_Download_Webpage_as_PDF.ipynb b/Python/Python_Download_Webpage_as_PDF.ipynb new file mode 100644 index 0000000000..729af19ced --- /dev/null +++ b/Python/Python_Download_Webpage_as_PDF.ipynb @@ -0,0 +1,313 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f10d5e39-7e5d-476f-8e2b-3a14ec878773", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "\"Naas\"" + ] + }, + { + "cell_type": "markdown", + "id": "699b8293-3e19-4e8c-9732-668e002efe19", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "# Python - Download Webpage as PDF" + ] + }, + { + "cell_type": "markdown", + "id": "acc3b5ee-6154-4199-9534-da3504743cc3", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**Tags:** #python #webpage #download #pdf #library #weasyprint" + ] + }, + { + "cell_type": "markdown", + "id": "da2f66ee-bab4-483a-bc79-cb42ace8d2a1", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**Author:** [Manasvi Agarwal](https://www.linkedin.com/in/mnsv/)" + ] + }, + { + "cell_type": "markdown", + "id": "52d2a5d6-53c1-4da3-ba02-b2cc2c491f97", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**Last update:** 2023-10-16 (Created: 2023-10-09)" + ] + }, + { + "cell_type": "markdown", + "id": "a3679e7a-44b8-48cc-92f6-faaa9ce4e088", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**Description:** This notebook downloads a webpage as a PDF file using the Python library weasyprint. It is useful for organizations that need to save webpages for archiving or for further analysis." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f0de625f-9545-44bb-9ef6-4819c2d6bc50", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**References:**\n", + "- [weasyprint Documentation](https://doc.courtbouillon.org/weasyprint/stable/)" + ] + }, + { + "cell_type": "markdown", + "id": "309e478b-03e6-4c0e-b0ac-0ce68f3eef2c", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "## Input" + ] + }, + { + "cell_type": "markdown", + "id": "8283e962-8c3a-40d8-b3a3-9a2f043436aa", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "bf4d6821-ca76-4cad-92d9-2535f4c877a4", + "metadata": { + "execution": { + "iopub.execute_input": "2023-10-16T06:07:44.816453Z", + "iopub.status.busy": "2023-10-16T06:07:44.816157Z", + "iopub.status.idle": "2023-10-16T06:07:44.825636Z", + "shell.execute_reply": "2023-10-16T06:07:44.824969Z", + "shell.execute_reply.started": "2023-10-16T06:07:44.816420Z" + }, + "papermill": {}, + "tags": [] + }, + "outputs": [], + "source": [ + "from weasyprint import HTML" + ] + }, + { + "cell_type": "markdown", + "id": "699e12f4-262d-45bc-a935-9f27070c02e9", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "### Setup variables\n", + "- `url`: URL of the webpage to be downloaded\n", + "- `webpage_path`: Path where the output PDF file will be saved" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "737c5d30-d65e-4e86-86aa-dc832c69fe44", + "metadata": { + "execution": { + "iopub.execute_input": "2023-10-16T06:07:46.582693Z", + "iopub.status.busy": "2023-10-16T06:07:46.582387Z", + "iopub.status.idle": "2023-10-16T06:07:46.585926Z", + "shell.execute_reply": "2023-10-16T06:07:46.585292Z", + "shell.execute_reply.started": "2023-10-16T06:07:46.582656Z" + }, + "papermill": {}, + "tags": [] + }, + "outputs": [], + "source": [ + "url = \"https://www.example.com\"\n", + "webpage_path = 
'webpage.pdf'" + ] + }, + { + "cell_type": "markdown", + "id": "fe70d837-f74e-49d0-b1c3-1821347fb592", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "## Model" + ] + }, + { + "cell_type": "markdown", + "id": "31968d51-d39e-4418-a7c7-9273949ff799", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "### Download webpage as PDF" + ] + }, + { + "cell_type": "markdown", + "id": "b0617cd8-a7ec-4101-9b3f-f70ac1543621", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "Render the webpage with `HTML(url).write_pdf()` from the `weasyprint` library and write the resulting PDF bytes to `webpage_path`." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a3b86f57-f170-4832-ad36-58cd1d0e25e6", + "metadata": { + "execution": { + "iopub.execute_input": "2023-10-16T06:07:49.397187Z", + "iopub.status.busy": "2023-10-16T06:07:49.396824Z", + "iopub.status.idle": "2023-10-16T06:07:49.973517Z", + "shell.execute_reply": "2023-10-16T06:07:49.972747Z", + "shell.execute_reply.started": "2023-10-16T06:07:49.397149Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Render before opening the file so a failed fetch leaves no partial PDF behind.\n", + "# Errors propagate on purpose: a run stops here instead of reaching the success message.\n", + "pdf = HTML(url).write_pdf()\n", + "with open(webpage_path, 'wb') as pdf_file:  # context manager closes the handle\n", + "    pdf_file.write(pdf)" + ] + }, + { + "cell_type": "markdown", + "id": "aefdb14c-f36f-4417-aa99-1b37e26be06b", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "## Output" + ] + }, + { + "cell_type": "markdown", + "id": "f7397b8d-6152-4d35-8f2f-01995e5a8858", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "### Display result" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0de6ab5b-99c8-41e2-b709-af44b3d2b385", + "metadata": { + "execution": { + "iopub.execute_input": "2023-10-16T06:07:51.806281Z", + "iopub.status.busy": "2023-10-16T06:07:51.805973Z", + "iopub.status.idle": "2023-10-16T06:07:51.814963Z", + "shell.execute_reply": "2023-10-16T06:07:51.814305Z", +
"shell.execute_reply.started": "2023-10-16T06:07:51.806246Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Webpage downloaded as PDF to webpage.pdf\n" + ] + } + ], + "source": [ + "print(f\"Webpage downloaded as PDF to {webpage_path}\")" + ] + }, + { + "cell_type": "markdown", + "id": "be9c4219-79a5-4668-97f4-38ea61ffbbc2", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + " " + ] + }, + { + "cell_type": "markdown", + "id": "b91cb912-ff76-428c-8e57-56341e93181f", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}