Skip to content

Commit

Permalink
handle MWUnknownContentModelException when using "--xmlrevisions" option
Browse files Browse the repository at this point in the history

by deleting the revision's content
  • Loading branch information
yzqzss committed Apr 16, 2024
1 parent 5e655bd commit 2878256
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 8 deletions.
4 changes: 2 additions & 2 deletions wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.dumpgenerator.exceptions import PageMissingError, ExportAbortedError
from wikiteam3.dumpgenerator.log import log_error
from wikiteam3.utils.util import underscore


def reconstructRevisions(root: ET.Element):
Expand Down Expand Up @@ -151,8 +152,7 @@ def getXMLPageWithApi(config: Config, title="", verbose=True, *, session: reques
if params['curonly'] is set, then using export&exportwrap to export
"""

title_ = title
title_ = re.sub(' ', '_', title_)
title_ = underscore(title)
# do not convert & into %26, title_ = re.sub('&', '%26', title_)
# action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE
# &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize
Expand Down
37 changes: 33 additions & 4 deletions wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
import mwclient.errors
import requests

from wikiteam3.dumpgenerator.exceptions import PageMissingError
from wikiteam3.dumpgenerator.cli.delay import Delay
from wikiteam3.dumpgenerator.exceptions import MWUnknownContentModelException, PageMissingError
from wikiteam3.dumpgenerator.log import log_error
from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI
from wikiteam3.dumpgenerator.api.page_titles import read_titles
Expand Down Expand Up @@ -60,9 +61,10 @@ def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, sit
# We have to build the XML manually...
# Skip flags, presumably needed to add <minor/> which is in the schema.
# Also missing: parentid and contentformat.
ARV_PROP = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags"
arv_params[
"arvprop"
] = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags"
] = ARV_PROP
print(
"Trying to get wikitext from the allrevisions API and to build the XML"
)
Expand All @@ -72,13 +74,41 @@ def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, sit
allrevs_response = site.api(
http_method=config.http_method, **arv_params
)
# reset params if the response is OK
arv_params["arvprop"] = ARV_PROP
if arv_params["arvlimit"] != config.api_chunksize:
arv_params["arvlimit"] = min(arv_params["arvlimit"] * 2, config.api_chunksize)
print(f"INFO: response is OK, increasing arvlimit to {arv_params['arvlimit']}")
except mwclient.errors.APIError as e:
if e.code == MWUnknownContentModelException.error_code:
if arv_params['arvlimit'] != 1:
# let's retry with arvlimit=1 to retrieve good revisions as much as possible
print("WARNING: API returned MWUnknownContentModelException. retrying with arvlimit=1 (revision by revision)")
arv_params["arvlimit"] = 1
Delay(config=config)
continue
elif '|content' in arv_params["arvprop"]:
log_error(config=config, to_stdout=True,
text=f"ERROR: API returned MWUnknownContentModelException on arvcontinue={arv_params.get('arvcontinue', '')}, " +
"retried with arvlimit=1 and still failed. retrying without arvprop=content. " +
                            '(wikiteam3 would mark the revision as "<text deleted="deleted">" in the xmldump)'
)
arv_params["arvprop"] = ARV_PROP.replace('|content', '')
Delay(config=config)
continue
else:
assert False, "This should not happen"
else:
raise

except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
Delay(config=config)
continue
else:
raise
Expand All @@ -98,6 +128,7 @@ def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, sit
):
print("POST request to the API failed (got HTML), retrying with GET")
config.http_method = "GET"
Delay(config=config)
continue
else:
raise
Expand Down Expand Up @@ -377,8 +408,6 @@ def getXMLRevisionsByTitles(config: Config, session: requests.Session, site: mwc
def getXMLRevisions(config: Config, session: requests.Session, lastPage=None, useAllrevision=True):
# FIXME: actually figure out the various strategies for each MediaWiki version
apiurl = urlparse(config.api)
# FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP?
# https://github.com/WikiTeam/wikiteam/issues/358
site = mwclient.Site(
apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme, pool=session
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def make_xml_from_page(page: Dict, arvcontinue: Optional[str] = None) -> str:
E.timestamp(rev["timestamp"]),]

# The text, user, comment, sha1 may be deleted/suppressed
if (('texthidden' in rev) or ('textmissing' in rev)):
if (('texthidden' in rev) or ('textmissing' in rev)) or ('*' not in rev):
print("Warning: text missing/hidden in pageid %d revid %d" % (page['pageid'], rev['revid']))
revision.append(E.text(**{
'bytes': str(size),
Expand Down Expand Up @@ -117,6 +117,7 @@ def make_xml_from_page(page: Dict, arvcontinue: Optional[str] = None) -> str:
_revision.append(elem)
p.append(_revision)
except KeyError as e:
print(e)
import traceback
traceback.print_exc()
raise PageMissingError(page["title"], e)
return etree.tostring(p, pretty_print=True, encoding="unicode")
18 changes: 18 additions & 0 deletions wikiteam3/dumpgenerator/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,24 @@
from typing import Optional


class InternalApiError(Exception):
""" base class for all internal API errors """
error_code = "internal_api_error_*"
errorclass = "MW*Exception"
common_cause = "reason a; reason b; reason c"
samples = ["url"]


class MWUnknownContentModelException(InternalApiError):
error_code = "internal_api_error_MWUnknownContentModelException"
errorclass = "MWUnknownContentModelException"
common_cause = "The content model xxxxx is not registered on this wiki; Some extensions use special content models for their own purposes, but they did not register a handler to export their content (?)"
samples = [
"https://web.archive.org/web/20231015082428id_/https://www.wikidoc.org/api.php?titles=Talk%3AMain_Page&action=query&format=xml&prop=revisions&rvprop=timestamp|user|comment|content|ids|flags|size|userid|sha1|contentmodel&rvlimit=50",
"https://web.archive.org/web/20231015082600id_/https://www.wikidoc.org/api.php?titles=Talk%3AMain_Page&action=query&format=json&prop=revisions&rvprop=timestamp|user|comment|content|ids|flags|size|userid|sha1|contentmodel&rvlimit=50"
]


class PageMissingError(Exception):
def __init__(self, title, xml):
self.title = title
Expand Down

0 comments on commit 2878256

Please sign in to comment.