Skip to content

Commit

Permalink
feat: incremental xmldump (--xmlrevisions) PoC (#24)
Browse files Browse the repository at this point in the history
  • Loading branch information
yzqzss authored Jul 23, 2024
1 parent 5a91db1 commit 1cdbd9c
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 2 deletions.
13 changes: 12 additions & 1 deletion wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from datetime import datetime
import os
import sys
import time
from typing import Dict, List, Optional
Expand All @@ -17,15 +18,25 @@
from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions_page import \
make_xml_from_page, make_xml_page_from_raw
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.utils.util import XMLRIVISIONS_INCREMENTAL_DUMP_MARK, mark_as_done

# Sentinel namespace id standing for "all namespaces".
ALL_NAMESPACE = -1

def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, site: mwclient.Site, nscontinue=None, arvcontinue=None):
def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, site: mwclient.Site, nscontinue=None, arvcontinue: Optional[str]=None):
if "all" not in config.namespaces:
namespaces = config.namespaces
else:
# namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
namespaces = [ALL_NAMESPACE] # magic number refers to "all"

# <- incremental xmldump
if env_arvcontinue := os.getenv("ARVCONTINUE", None):
mark_as_done(config, XMLRIVISIONS_INCREMENTAL_DUMP_MARK)
print(f"Using [env]ARVCONTINUE={env_arvcontinue}")
arvcontinue = env_arvcontinue
print("\n\n[NOTE] DO NOT use wikiteam3uploader to upload incremental xmldump to Internet Archive, we haven't implemented it yet\n\n")
# ->

_nscontinue_input = nscontinue
_arvcontinue_input = arvcontinue
del nscontinue
Expand Down
21 changes: 21 additions & 0 deletions wikiteam3/tools/get_arvcontinue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import argparse

from wikiteam3.dumpgenerator.dump.xmldump.xml_truncate import parse_last_page_chunk, truncateXMLDump

def parse_args():
    """Parse the command line of the arvcontinue helper tool.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``; its ``xml``
        attribute holds the single positional XML file argument.
    """
    cli = argparse.ArgumentParser(description="Get the next arvcontinue value")
    cli.add_argument("xml", help="XML file")
    return cli.parse_args()

def main():
    """Print the ``arvcontinue`` value stored on the last page of an XML dump.

    Takes the XML file named on the command line, obtains its last page
    chunk from ``truncateXMLDump(..., dryrun=True)``, parses that chunk with
    ``parse_last_page_chunk`` and prints the page's ``arvcontinue`` attribute
    as an ``ARVCONTINUE="..."`` line.

    Raises:
        SystemExit: if no last page could be parsed from the file, or the
            parsed page carries no ``arvcontinue`` attribute.
    """
    args = parse_args()
    xmlfile: str = args.xml
    # NOTE(review): dryrun=True presumably leaves the file untouched -- confirm
    # against truncateXMLDump.
    lastPageChunk = truncateXMLDump(xmlfile, dryrun=True)
    lastPage = parse_last_page_chunk(lastPageChunk)
    # Explicit check instead of `assert`: assertions are removed under
    # `python -O`, which would turn this into an opaque AttributeError.
    if lastPage is None:
        raise SystemExit(f"error: could not parse the last page of {xmlfile!r}")
    try:
        lastArvcontinue = lastPage.attrib['arvcontinue']
    except KeyError:
        raise SystemExit(
            f"error: last page of {xmlfile!r} has no 'arvcontinue' attribute"
        ) from None
    print(f'ARVCONTINUE="{lastArvcontinue}"')

# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
3 changes: 2 additions & 1 deletion wikiteam3/uploader/uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from wikiteam3.utils import url2prefix_from_config, sha1sum
from wikiteam3.uploader.compresser import ZstdCompressor, SevenZipCompressor
from wikiteam3.utils.ia_checker import ia_s3_tasks_load_avg
from wikiteam3.utils.util import ALL_DUMPED_MARK, UPLOADED_MARK, is_empty_dir, mark_as_done, is_markfile_exists
from wikiteam3.utils.util import ALL_DUMPED_MARK, UPLOADED_MARK, XMLRIVISIONS_INCREMENTAL_DUMP_MARK, is_empty_dir, mark_as_done, is_markfile_exists

DEFAULT_COLLECTION = 'opensource'
TEST_COLLECTION = 'test_collection'
Expand Down Expand Up @@ -384,6 +384,7 @@ def upload(arg: Args):
assert wikidump_dir == Path(config.path).resolve()

assert is_markfile_exists(config, ALL_DUMPED_MARK), "Imcomplete dump"
# Incremental xmlrevisions dumps leave this mark file behind; uploading them
# is not supported, so the mark must be ABSENT for the upload to proceed.
assert not is_markfile_exists(config, XMLRIVISIONS_INCREMENTAL_DUMP_MARK), "xmlrevisions incremental dump is not supported yet"
if is_markfile_exists(config, UPLOADED_MARK):
print(f"Already uploaded to IA ({UPLOADED_MARK} exists), bye!")
return
Expand Down
1 change: 1 addition & 0 deletions wikiteam3/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

# File names of the marker files passed to mark_as_done / is_markfile_exists.
ALL_DUMPED_MARK = "all_dumped.mark"
UPLOADED_MARK = 'uploaded_to_IA.mark'
# NOTE(review): "XMLRIVISIONS" looks like a misspelling of "XMLREVISIONS"; the
# name is imported by other modules, so a rename must update them together.
XMLRIVISIONS_INCREMENTAL_DUMP_MARK = 'xmlrevisions_incremental_dump.mark'


def underscore(text: str) -> str:
Expand Down

0 comments on commit 1cdbd9c

Please sign in to comment.