Skip to content

Commit

Permalink
improve docs for dl module
Browse files Browse the repository at this point in the history
  • Loading branch information
freddyheppell committed Jul 10, 2024
1 parent 3f80b86 commit 325f167
Show file tree
Hide file tree
Showing 12 changed files with 98 additions and 217 deletions.
12 changes: 6 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,12 @@ ignore = [
"D103", # Ignore method docstring errors in tests
"PD901", # Allow `df` variable name in tests
]
"src/wpextract/dl/*" = [
"D415",
"D103",
"D101",
"D107"
]
#"src/wpextract/dl/*" = [
# "D415",
# "D103",
# "D101",
# "D107"
#]

[tool.ruff.lint.pydocstyle]
convention = "google"
Expand Down
1 change: 1 addition & 0 deletions src/wpextract/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from wpextract.downloader import WPDownloader as WPDownloader

from .extract import WPExtractor as WPExtractor
2 changes: 1 addition & 1 deletion src/wpextract/cli/_dl.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from argparse import Namespace

from wpextract.cli._shared import _register_shared
from wpextract.downloader import WPDownloader
from wpextract.dl.requestsession import RequestSession
from wpextract.downloader import WPDownloader
from wpextract.util.args import empty_directory

dl_types = ["categories", "media", "pages", "posts", "tags", "users"]
Expand Down
2 changes: 1 addition & 1 deletion src/wpextract/dl/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .requestsession import RequestSession as RequestSession
from .requestsession import AuthorizationType as AuthorizationType
from .requestsession import RequestSession as RequestSession
28 changes: 3 additions & 25 deletions src/wpextract/dl/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,16 @@
"""Copyright (c) 2018-2020 Mickaël "Kilawyn" Walter
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""


class NoWordpressApi(Exception):
"""No API is available at the given URL"""
"""No API is available at the given URL."""

pass


class WordPressApiNotV2(Exception):
"""The WordPress V2 API is not available"""
"""The WordPress V2 API is not available."""

pass


class NSNotFoundException(Exception):
"""The specified namespace does not exist"""
"""The specified namespace does not exist."""

pass
124 changes: 33 additions & 91 deletions src/wpextract/dl/exporter.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,8 @@
"""Copyright (c) 2018-2020 Mickaël "Kilawyn" Walter
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import copy
import html
import json
import logging
import os
from typing import List
from urllib import parse as urlparse

from tqdm.auto import tqdm
Expand All @@ -32,25 +11,24 @@


class Exporter:
"""Utility functions to export data"""
"""Utility functions to export data."""

JSON = 1
"""Represents the JSON format for format choice"""
CHUNK_SIZE = 2048
"""The size of chunks to download large files"""

@staticmethod
def download_media(session: RequestSession, media, output_folder):
"""Downloads the media files based on the given URLs
def download_media(
session: RequestSession, media: List[str], output_folder: str
) -> int:
"""Downloads the media files based on the given URLs.
Args:
session: the request session to use
media: the URLs as a list
output_folder: the path to the folder where the files are
being saved, it is assumed as existing
output_folder: the path to the folder where the files are being saved, it is assumed as existing
Returns:
the number of files wrote
the number of files written
"""
files_number = 0
for m in tqdm(media, unit="media"):
Expand Down Expand Up @@ -147,34 +125,27 @@ def setup_export(vlist, parameters_to_unescape):
return exported_list

@staticmethod
def write_file(filename, fmt, data):
"""Writes content to the given file using the given format.
def write_file(filename, data):
"""Writes content to the given file in JSON format.
The key mapping must be a dict of keys or lists of keys to ensure proper mapping.
Args:
filename: the path of the file
fmt: the format of the file
data: the actual data to export
"""
with open(filename, "w", encoding="utf-8") as f:
if fmt == Exporter.JSON:
# The JSON format is straightforward, we dump the flattened objects to JSON
json.dump(data, f, ensure_ascii=False, indent=4)
else:
raise ValueError("Unknown export format")
json.dump(data, f, ensure_ascii=False, indent=4)

@staticmethod
def export_posts(
posts,
fmt,
filename,
posts: List[dict],
filename: str,
):
"""Exports posts in specified format to specified file
"""Exports posts to the specified file.
Args:
posts: the posts to export
fmt: the export format (JSON or CSV)
filename: filename to use
Returns:
Expand All @@ -185,16 +156,15 @@ def export_posts(
[["title", "rendered"], ["content", "rendered"], ["excerpt", "rendered"]],
)

Exporter.write_file(filename, fmt, exported_posts)
Exporter.write_file(filename, exported_posts)
return len(exported_posts)

@staticmethod
def export_categories(categories, fmt, filename):
"""Exports categories in specified format to specified file.
def export_categories(categories, filename):
"""Exports categories to the specified file.
Args:
categories: the categories to export
fmt: the export format (JSON or CSV)
filename: the path to the file to write
Returns:
Expand All @@ -205,51 +175,46 @@ def export_categories(categories, fmt, filename):
[],
)

Exporter.write_file(filename, fmt, exported_categories)
Exporter.write_file(filename, exported_categories)
return len(exported_categories)

@staticmethod
def export_tags(tags, fmt, filename):
"""Exports tags in specified format to specified file
def export_tags(tags, filename):
"""Exports tags to the specified file.
Args:
tags: the tags to export
fmt: the export format (JSON or CSV)
filename: the path to the file to write
Returns:
the length of the list written to the file
"""
exported_tags = tags # It seems that no modification will be done for this one, so no deepcopy
Exporter.write_file(filename, fmt, exported_tags)
Exporter.write_file(filename, exported_tags)
return len(exported_tags)

@staticmethod
def export_users(users, fmt, filename):
"""Exports users in specified format to specified file.
def export_users(users, filename):
"""Exports users to the specified file.
Args:
users: the users to export
fmt: the export format (JSON or CSV)
filename: the path to the file to write
Returns:
the length of the list written to the file
"""
exported_users = users # It seems that no modification will be done for this one, so no deepcopy
Exporter.write_file(filename, fmt, exported_users)
Exporter.write_file(filename, exported_users)
return len(exported_users)

@staticmethod
def export_pages(pages, fmt, filename, parent_pages=None, users=None):
"""Exports pages in specified format to specified file.
def export_pages(pages, filename):
"""Exports pages to the specified file.
Args:
pages: the pages to export
fmt: the export format (JSON or CSV)
filename: the path to the file to write
parent_pages: the list of all cached pages, to get parents
users: the list of all cached users, to get users
Returns:
the length of the list written to the file
Expand All @@ -264,18 +229,16 @@ def export_pages(pages, fmt, filename, parent_pages=None, users=None):
],
)

Exporter.write_file(filename, fmt, exported_pages)
Exporter.write_file(filename, exported_pages)
return len(exported_pages)

@staticmethod
def export_media(media, fmt, filename, users=None):
"""Exports media in specified format to specified file.
def export_media(media, filename):
"""Exports media to the specified file.
Args:
media: the media to export
fmt: the export format (JSON or CSV)
filename: file to export to
users: a list of users to associate them with author ids
Returns:
the length of the list written to the file
Expand All @@ -290,38 +253,17 @@ def export_media(media, fmt, filename, users=None):
],
)

Exporter.write_file(filename, fmt, exported_media)
Exporter.write_file(filename, exported_media)
return len(exported_media)

@staticmethod
def export_namespaces(namespaces, fmt, filename):
"""**NOT IMPLEMENTED** Exports namespaces in specified format to specified file.
Args:
namespaces: the namespaces to export
fmt: the export format (JSON or CSV)
filename: file to export to
Returns:
the length of the list written to the file
"""
logging.info("Namespaces export not available yet")
return 0

# FIXME to be refactored
@staticmethod
def export_comments_interactive(
comments, fmt, filename, parent_posts=None, users=None
):
"""Exports comments in specified format to specified file.
def export_comments_interactive(comments, filename):
"""Exports comments to the specified file.
Args:
comments: the comments to export
fmt: the export format (JSON or CSV)
filename: the path to the file to write
parent_posts: the list of all cached posts, to get parent
posts (not used yet because this could be too verbose)
users: the list of all cached users, to get users
Returns:
the length of the list written to the file
Expand All @@ -331,5 +273,5 @@ def export_comments_interactive(
[["content", "rendered"]],
)

Exporter.write_file(filename, fmt, exported_comments)
Exporter.write_file(filename, exported_comments)
return len(exported_comments)
Loading

0 comments on commit 325f167

Please sign in to comment.