diff --git a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py
index 26922067..fa8631c5 100644
--- a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py
+++ b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py
@@ -11,6 +11,7 @@
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.dumpgenerator.exceptions import PageMissingError, ExportAbortedError
from wikiteam3.dumpgenerator.log import log_error
+from wikiteam3.utils.util import underscore
def reconstructRevisions(root: ET.Element):
@@ -151,8 +152,7 @@ def getXMLPageWithApi(config: Config, title="", verbose=True, *, session: reques
    if params['curonly'] is set, use export&exportwrap to export
"""
- title_ = title
- title_ = re.sub(' ', '_', title_)
+ title_ = underscore(title)
# do not convert & into %26, title_ = re.sub('&', '%26', title_)
# action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE
# &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize
diff --git a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py
index 001f947c..ad88f509 100644
--- a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py
+++ b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py
@@ -9,7 +9,8 @@
import mwclient.errors
import requests
-from wikiteam3.dumpgenerator.exceptions import PageMissingError
+from wikiteam3.dumpgenerator.cli.delay import Delay
+from wikiteam3.dumpgenerator.exceptions import MWUnknownContentModelException, PageMissingError
from wikiteam3.dumpgenerator.log import log_error
from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI
from wikiteam3.dumpgenerator.api.page_titles import read_titles
@@ -60,9 +61,10 @@ def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, sit
# We have to build the XML manually...
            # Skip flags, presumably needed to add <minor/> which is in the schema.
# Also missing: parentid and contentformat.
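+        # keep the full prop list in a constant so it can be restored after temporarily dropping '|content' below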
+ ARV_PROP = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags"
arv_params[
"arvprop"
- ] = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags"
+ ] = ARV_PROP
print(
"Trying to get wikitext from the allrevisions API and to build the XML"
)
@@ -72,6 +74,33 @@ def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, sit
allrevs_response = site.api(
http_method=config.http_method, **arv_params
)
+ # reset params if the response is OK
+ arv_params["arvprop"] = ARV_PROP
+ if arv_params["arvlimit"] != config.api_chunksize:
+ arv_params["arvlimit"] = min(arv_params["arvlimit"] * 2, config.api_chunksize)
+ print(f"INFO: response is OK, increasing arvlimit to {arv_params['arvlimit']}")
+ except mwclient.errors.APIError as e:
+ if e.code == MWUnknownContentModelException.error_code:
+ if arv_params['arvlimit'] != 1:
+                        # let's retry with arvlimit=1 to retrieve as many good revisions as possible
+ print("WARNING: API returned MWUnknownContentModelException. retrying with arvlimit=1 (revision by revision)")
+ arv_params["arvlimit"] = 1
+ Delay(config=config)
+ continue
+ elif '|content' in arv_params["arvprop"]:
+ log_error(config=config, to_stdout=True,
+ text=f"ERROR: API returned MWUnknownContentModelException on arvcontinue={arv_params.get('arvcontinue', '')}, " +
+ "retried with arvlimit=1 and still failed. retrying without arvprop=content. " +
+                            '(wikiteam3 would mark the revision as <text deleted="deleted" /> in the xmldump)'
+ )
+ arv_params["arvprop"] = ARV_PROP.replace('|content', '')
+ Delay(config=config)
+ continue
+ else:
+ assert False, "This should not happen"
+ else:
+ raise
+
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
@@ -79,6 +108,7 @@ def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, sit
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
+ Delay(config=config)
continue
else:
raise
@@ -98,6 +128,7 @@ def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, sit
):
print("POST request to the API failed (got HTML), retrying with GET")
config.http_method = "GET"
+ Delay(config=config)
continue
else:
raise
@@ -377,8 +408,6 @@ def getXMLRevisionsByTitles(config: Config, session: requests.Session, site: mwc
def getXMLRevisions(config: Config, session: requests.Session, lastPage=None, useAllrevision=True):
# FIXME: actually figure out the various strategies for each MediaWiki version
apiurl = urlparse(config.api)
- # FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP?
- # https://github.com/WikiTeam/wikiteam/issues/358
site = mwclient.Site(
apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme, pool=session
)
diff --git a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py
index 45c66cc1..2c465ace 100644
--- a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py
+++ b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py
@@ -56,7 +56,7 @@ def make_xml_from_page(page: Dict, arvcontinue: Optional[str] = None) -> str:
E.timestamp(rev["timestamp"]),]
# The text, user, comment, sha1 may be deleted/suppressed
- if (('texthidden' in rev) or ('textmissing' in rev)):
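+        # the revision text comes back under the '*' key; it can be absent even when texthidden/textmissing are not set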
+ if (('texthidden' in rev) or ('textmissing' in rev)) or ('*' not in rev):
print("Warning: text missing/hidden in pageid %d revid %d" % (page['pageid'], rev['revid']))
revision.append(E.text(**{
'bytes': str(size),
@@ -117,6 +117,7 @@ def make_xml_from_page(page: Dict, arvcontinue: Optional[str] = None) -> str:
_revision.append(elem)
p.append(_revision)
except KeyError as e:
- print(e)
+ import traceback
+ traceback.print_exc()
raise PageMissingError(page["title"], e)
return etree.tostring(p, pretty_print=True, encoding="unicode")
diff --git a/wikiteam3/dumpgenerator/exceptions.py b/wikiteam3/dumpgenerator/exceptions.py
index 80d17cd0..d7cef602 100644
--- a/wikiteam3/dumpgenerator/exceptions.py
+++ b/wikiteam3/dumpgenerator/exceptions.py
@@ -1,6 +1,24 @@
from typing import Optional
+class InternalApiError(Exception):
+ """ base class for all internal API errors """
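+    # placeholder values; concrete subclasses override these with the real error code, known causes, and sample URLs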
+ error_code = "internal_api_error_*"
+ errorclass = "MW*Exception"
+ common_cause = "reason a; reason b; reason c"
+ samples = ["url"]
+
+
+class MWUnknownContentModelException(InternalApiError):
+ error_code = "internal_api_error_MWUnknownContentModelException"
+ errorclass = "MWUnknownContentModelException"
+ common_cause = "The content model xxxxx is not registered on this wiki; Some extensions use special content models for their own purposes, but they did not register a handler to export their content (?)"
+ samples = [
+ "https://web.archive.org/web/20231015082428id_/https://www.wikidoc.org/api.php?titles=Talk%3AMain_Page&action=query&format=xml&prop=revisions&rvprop=timestamp|user|comment|content|ids|flags|size|userid|sha1|contentmodel&rvlimit=50",
+ "https://web.archive.org/web/20231015082600id_/https://www.wikidoc.org/api.php?titles=Talk%3AMain_Page&action=query&format=json&prop=revisions&rvprop=timestamp|user|comment|content|ids|flags|size|userid|sha1|contentmodel&rvlimit=50"
+ ]
+
+
class PageMissingError(Exception):
def __init__(self, title, xml):
self.title = title