From 2878256140efa495f00b79647ea87fe4f3439a5f Mon Sep 17 00:00:00 2001
From: yzqzss
Date: Tue, 16 Apr 2024 20:18:25 +0800
Subject: [PATCH] handle `MWUnknownContentModelException` when using the
 "--xmlrevisions" option by deleting the revision's content

---
 .../dump/page/xmlexport/page_xml_api.py    |  4 +-
 .../dump/page/xmlrev/xml_revisions.py      | 37 +++++++++++++++++--
 .../dump/page/xmlrev/xml_revisions_page.py |  5 ++-
 wikiteam3/dumpgenerator/exceptions.py      | 18 +++++++++
 4 files changed, 56 insertions(+), 8 deletions(-)

diff --git a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py
index 26922067..fa8631c5 100644
--- a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py
+++ b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py
@@ -11,6 +11,7 @@
 from wikiteam3.dumpgenerator.config import Config
 from wikiteam3.dumpgenerator.exceptions import PageMissingError, ExportAbortedError
 from wikiteam3.dumpgenerator.log import log_error
+from wikiteam3.utils.util import underscore
 
 
 def reconstructRevisions(root: ET.Element):
@@ -151,8 +152,7 @@ def getXMLPageWithApi(config: Config, title="", verbose=True, *, session: reques
 
     if params['curonly'] is set, then using export&exportwrap to export
     """
-    title_ = title
-    title_ = re.sub(' ', '_', title_)
+    title_ = underscore(title)
     # do not convert & into %26, title_ = re.sub('&', '%26', title_)
     # action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE
     # &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize
diff --git a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py
index 001f947c..ad88f509 100644
--- a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py
+++ b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py
@@ -9,7 +9,8 @@
 import mwclient.errors
 import requests
 
-from wikiteam3.dumpgenerator.exceptions import PageMissingError
+from wikiteam3.dumpgenerator.cli.delay import Delay
+from wikiteam3.dumpgenerator.exceptions import MWUnknownContentModelException, PageMissingError
 from wikiteam3.dumpgenerator.log import log_error
 from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI
 from wikiteam3.dumpgenerator.api.page_titles import read_titles
@@ -60,9 +61,10 @@ def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, sit
             # We have to build the XML manually...
             # Skip flags, presumably needed to add <minor/> which is in the schema.
             # Also missing: parentid and contentformat.
+            ARV_PROP = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags"
             arv_params[
                 "arvprop"
-            ] = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags"
+            ] = ARV_PROP
             print(
                 "Trying to get wikitext from the allrevisions API and to build the XML"
             )
@@ -72,6 +74,33 @@ def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, sit
                 allrevs_response = site.api(
                     http_method=config.http_method, **arv_params
                 )
+                # reset params if the response is OK
+                arv_params["arvprop"] = ARV_PROP
+                if arv_params["arvlimit"] != config.api_chunksize:
+                    arv_params["arvlimit"] = min(arv_params["arvlimit"] * 2, config.api_chunksize)
+                    print(f"INFO: response is OK, increasing arvlimit to {arv_params['arvlimit']}")
+            except mwclient.errors.APIError as e:
+                if e.code == MWUnknownContentModelException.error_code:
+                    if arv_params['arvlimit'] != 1:
+                        # let's retry with arvlimit=1 to retrieve as many good revisions as possible
+                        print("WARNING: API returned MWUnknownContentModelException. retrying with arvlimit=1 (revision by revision)")
+                        arv_params["arvlimit"] = 1
+                        Delay(config=config)
+                        continue
+                    elif '|content' in arv_params["arvprop"]:
+                        log_error(config=config, to_stdout=True,
+                            text=f"ERROR: API returned MWUnknownContentModelException on arvcontinue={arv_params.get('arvcontinue', '')}, "
+                            + "retried with arvlimit=1 and still failed. retrying without arvprop=content. "
+                            + '(wikiteam3 would mark the revision as <text deleted="deleted" /> in the xmldump)'
+                        )
+                        arv_params["arvprop"] = ARV_PROP.replace('|content', '')
+                        Delay(config=config)
+                        continue
+                    else:
+                        assert False, "This should not happen"
+                else:
+                    raise
+
             except requests.exceptions.HTTPError as e:
                 if (
                     e.response.status_code == 405
@@ -79,6 +108,7 @@ def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, sit
                 ):
                     print("POST request to the API failed, retrying with GET")
                     config.http_method = "GET"
+                    Delay(config=config)
                     continue
                 else:
                     raise
@@ -98,6 +128,7 @@ def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, sit
                 ):
                     print("POST request to the API failed (got HTML), retrying with GET")
                     config.http_method = "GET"
+                    Delay(config=config)
                     continue
                 else:
                     raise
@@ -377,8 +408,6 @@ def getXMLRevisionsByTitles(config: Config, session: requests.Session, site: mwc
 def getXMLRevisions(config: Config, session: requests.Session, lastPage=None, useAllrevision=True):
     # FIXME: actually figure out the various strategies for each MediaWiki version
     apiurl = urlparse(config.api)
-    # FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP?
-    # https://github.com/WikiTeam/wikiteam/issues/358
     site = mwclient.Site(
         apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme, pool=session
     )
diff --git a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py
index 45c66cc1..2c465ace 100644
--- a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py
+++ b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py
@@ -56,7 +56,7 @@ def make_xml_from_page(page: Dict, arvcontinue: Optional[str] = None) -> str:
                 E.timestamp(rev["timestamp"]),]
 
             # The text, user, comment, sha1 may be deleted/suppressed
-            if (('texthidden' in rev) or ('textmissing' in rev)):
+            if (('texthidden' in rev) or ('textmissing' in rev)) or ('*' not in rev):
                 print("Warning: text missing/hidden in pageid %d revid %d" % (page['pageid'], rev['revid']))
                 revision.append(E.text(**{
                     'bytes': str(size),
@@ -117,6 +117,7 @@ def make_xml_from_page(page: Dict, arvcontinue: Optional[str] = None) -> str:
             _revision.append(elem)
         p.append(_revision)
     except KeyError as e:
-        print(e)
+        import traceback
+        traceback.print_exc()
         raise PageMissingError(page["title"], e)
     return etree.tostring(p, pretty_print=True, encoding="unicode")
diff --git a/wikiteam3/dumpgenerator/exceptions.py b/wikiteam3/dumpgenerator/exceptions.py
index 80d17cd0..d7cef602 100644
--- a/wikiteam3/dumpgenerator/exceptions.py
+++ b/wikiteam3/dumpgenerator/exceptions.py
@@ -1,6 +1,24 @@
 from typing import Optional
 
 
+class InternalApiError(Exception):
+    """ base class for all internal API errors """
+    error_code = "internal_api_error_*"
+    errorclass = "MW*Exception"
+    common_cause = "reason a; reason b; reason c"
+    samples = ["url"]
+
+
+class MWUnknownContentModelException(InternalApiError):
+    error_code = "internal_api_error_MWUnknownContentModelException"
+    errorclass = "MWUnknownContentModelException"
+    common_cause = "The content model xxxxx is not registered on this wiki; some extensions use special content models for their own purposes, but they did not register a handler to export their content (?)"
+    samples = [
+        "https://web.archive.org/web/20231015082428id_/https://www.wikidoc.org/api.php?titles=Talk%3AMain_Page&action=query&format=xml&prop=revisions&rvprop=timestamp|user|comment|content|ids|flags|size|userid|sha1|contentmodel&rvlimit=50",
+        "https://web.archive.org/web/20231015082600id_/https://www.wikidoc.org/api.php?titles=Talk%3AMain_Page&action=query&format=json&prop=revisions&rvprop=timestamp|user|comment|content|ids|flags|size|userid|sha1|contentmodel&rvlimit=50"
+    ]
+
+
 class PageMissingError(Exception):
     def __init__(self, title, xml):
         self.title = title
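
Notes:

The error handling added to getXMLRevisionsByAllRevisions degrades in two steps: it first retries with arvlimit=1 so only the offending revision fails rather than a whole chunk, then drops |content from arvprop so the revision's metadata still makes it into the dump; on each successful response it restores the full arvprop and doubles arvlimit back toward config.api_chunksize. A minimal standalone sketch of that degrade-and-recover loop follows. It is not wikiteam3 code: fetch, ApiError, and the "return None when exhausted" convention are hypothetical stand-ins.

FULL_PROP = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags"
UNKNOWN_CM = "internal_api_error_MWUnknownContentModelException"


class ApiError(Exception):
    """Stand-in for mwclient.errors.APIError; carries the MediaWiki error code."""
    def __init__(self, code):
        self.code = code


def fetch_all_revisions(fetch, chunk_size=50):
    """Drive fetch(params) with the patch's degrade-and-recover strategy.

    fetch returns a list of revisions, or None when the wiki is exhausted,
    and raises ApiError on failure (a hypothetical contract for this sketch).
    """
    params = {"arvlimit": chunk_size, "arvprop": FULL_PROP}
    revisions = []
    while True:
        try:
            batch = fetch(dict(params))
        except ApiError as e:
            if e.code != UNKNOWN_CM:
                raise
            if params["arvlimit"] != 1:
                # Step 1: one revision per request, so a single bad
                # revision no longer poisons a whole chunk.
                params["arvlimit"] = 1
                continue
            if "|content" in params["arvprop"]:
                # Step 2: give up on this revision's content but keep
                # its metadata in the dump.
                params["arvprop"] = FULL_PROP.replace("|content", "")
                continue
            raise AssertionError("already fully degraded; should not happen")
        # Request succeeded: restore full props and grow arvlimit back
        # toward the configured chunk size.
        params["arvprop"] = FULL_PROP
        params["arvlimit"] = min(params["arvlimit"] * 2, chunk_size)
        if batch is None:
            return revisions
        revisions.extend(batch)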
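
When the content was dropped this way, make_xml_from_page receives a revision dict without a '*' (content) key, and the new "or ('*' not in rev)" branch marks the revision's text element as deleted. A short sketch of the element that branch produces, using lxml's ElementMaker as xml_revisions_page.py does (size and rev are stand-in values):

from lxml import etree
from lxml.builder import E

rev = {"revid": 123}  # a revision dict whose '*' (content) key is absent
size = 0
text_elem = E.text(**{"bytes": str(size), "deleted": "deleted"})
print(etree.tostring(text_elem, encoding="unicode"))
# prints: <text bytes="0" deleted="deleted"/>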
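
The new classes in exceptions.py are mostly data carriers: callers match mwclient's APIError.code string against the error_code class attribute, as the except clause above does. A hedged sketch of how a caller could resolve an error code to the matching class; classify_api_error is illustrative only and not part of the patch, and it assumes the patched wikiteam3 is importable:

from typing import Optional, Type

from wikiteam3.dumpgenerator.exceptions import InternalApiError


def classify_api_error(code: str) -> Optional[Type[InternalApiError]]:
    # error_code is a class attribute on every subclass, e.g.
    # "internal_api_error_MWUnknownContentModelException".
    for cls in InternalApiError.__subclasses__():
        if cls.error_code == code:
            return cls
    return None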