-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathingest.py
130 lines (96 loc) · 4.07 KB
/
ingest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import re
import sqlite3
import xml.etree.ElementTree as ET
# Absolute path to the simplewiki pages-articles XML dump. It is resolved
# against the directory containing this script, so it does not depend on the
# current working directory.
input_file = os.path.join(
    os.path.abspath(os.path.dirname(__file__)),
    './data/simplewiki-latest-pages-articles.xml',
)
# Matches [[Target]] and [[Target|label]] wiki links, capturing only Target.
# Compiled once at import time rather than once per parsed page.
_LINK_PATTERN = re.compile(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]')


def page_get_meta(page):
    """Extract the title, link targets and redirect target of one page.

    Args:
        page: An ElementTree element for a single page node. Child tags are
            matched by suffix, so namespace-qualified tags such as
            ``{ns}title`` are recognised as well.

    Returns:
        A ``(title, links, redirect)`` tuple:
          * ``title`` - text of the title child, or ``"Unknown"`` when the
            child is missing or empty.
          * ``links`` - whitespace-stripped targets of every ``[[...]]``
            link found in the revision text, in order of appearance
            (duplicates kept; entries may be empty strings).
          * ``redirect`` - the ``title`` attribute of the redirect child,
            or ``None`` when the page has no redirect child.
    """
    title = "Unknown"
    link_txt = ""
    redirect = None

    for child in page:
        tag = child.tag
        if tag.endswith('title'):
            # An empty <title/> has text None; keep the placeholder so that
            # callers always receive a string.
            title = child.text or title
        if tag.endswith('redirect'):
            redirect = child.get('title')
        if tag.endswith('revision'):
            for elem in child:
                if elem.tag.endswith('text'):
                    link_txt = elem.text

    # An empty text element has text None; report it and treat as no links.
    if link_txt is None:
        print(f'\n \033[36m{title}\033[0m has no body text')
        link_txt = ""

    links = [target.strip() for target in _LINK_PATTERN.findall(link_txt)]
    return (title, links, redirect)
def Get_Article_ID(title, cursor):
    """Return the ``articles.id`` for *title*, inserting the row if needed.

    The first character of the title is upper-cased before lookup, so
    ``"apple"`` and ``"Apple"`` resolve to the same row; the rest of the
    title is left untouched.

    Args:
        title: Article title string. An empty string is passed through
            unchanged instead of raising ``IndexError``; callers that do not
            want a blank article row should filter empty titles beforehand.
        cursor: A sqlite3 cursor on a database that has an ``articles``
            table with ``id`` and ``title`` columns.

    Returns:
        The integer id of the (possibly newly inserted) article row.
    """
    # Slicing (rather than indexing) keeps this safe for an empty title.
    title = title[:1].upper() + title[1:]

    # Insert the article if it is not present yet; existing rows are kept.
    cursor.execute("INSERT OR IGNORE INTO articles (title) VALUES (?)", (title,))

    # Read the id back: after an ignored insert the row already existed.
    cursor.execute("SELECT id FROM articles WHERE title = ?", (title,))
    return cursor.fetchone()[0]
def Ingest_Links(cursor, conn):
    """Stream the XML dump and store its articles, links and redirects.

    Every element whose tag ends in ``page`` is passed to ``page_get_meta``.
    A page with a redirect target produces one row in ``redirects``; any
    other page produces one row in ``links`` per non-blank link target.
    Article ids are obtained (and rows created on demand) through
    ``Get_Article_ID``. Work is committed in batches and a percentage based
    on the read position in the input file is printed as it goes. Row counts
    for the three tables are printed at the end.

    Args:
        cursor: sqlite3 cursor used for all inserts and the final counts.
        conn: The sqlite3 connection owning *cursor*; used for commits.
    """
    total_bytes = os.path.getsize(input_file)
    batch_size = 5000  # inserted rows between commits
    batch = 0

    def report_progress(position):
        progress = (position / total_bytes) * 100
        print(f'\r Processed: {progress:.2f}%', end='')

    # Open the file ourselves so that it is closed deterministically and so
    # that progress can be read from the file position instead of
    # re-serialising every page just to count its bytes.
    with open(input_file, 'rb') as source:
        root = None
        for event, elem in ET.iterparse(source, events=('start', 'end')):
            if event == 'start':
                # The very first start event belongs to the document root.
                if root is None:
                    root = elem
                continue
            if not elem.tag.endswith('page'):
                continue

            title, links, redirect = page_get_meta(elem)
            from_article_id = Get_Article_ID(title, cursor)

            if redirect:
                to_article_id = Get_Article_ID(redirect, cursor)
                cursor.execute("INSERT OR IGNORE INTO redirects (from_article_id, to_article_id) VALUES (?, ?)", (from_article_id, to_article_id))
                batch += 1
            else:
                for link in links:
                    if not link:
                        print(f'\n \033[36m{title}\033[0m has a blank link')
                        continue

                    to_article_id = Get_Article_ID(link, cursor)
                    # Insert the link between them if not already present.
                    cursor.execute("INSERT OR IGNORE INTO links (from_article_id, to_article_id) VALUES (?, ?)", (from_article_id, to_article_id))
                    batch += 1

            # Commit at page boundaries once enough rows are pending; this
            # covers redirect rows as well as link rows.
            if batch >= batch_size:
                report_progress(source.tell())
                conn.commit()
                batch = 0

            # Free memory: drop the page's content, then detach finished
            # children from the root so emptied pages do not accumulate.
            elem.clear()
            root.clear()

        report_progress(source.tell())

    conn.commit()

    cursor.execute("SELECT COUNT(*) FROM articles")
    print(f"\n\n Total articles: {cursor.fetchone()[0]}")

    cursor.execute("SELECT COUNT(*) FROM links")
    print(f" Total links: {cursor.fetchone()[0]}")

    cursor.execute("SELECT COUNT(*) FROM redirects")
    print(f" Total redirects: {cursor.fetchone()[0]}")
def main():
    """Create the schema, ingest the dump, then run the redirect script.

    Opens (or creates) ``./data/simplewiki.db``, makes sure the ``articles``,
    ``links`` and ``redirects`` tables and their indexes exist, calls
    ``Ingest_Links`` to populate them, and finally executes the SQL script
    stored in ``./redirect.sql``.

    NOTE(review): the database and SQL script paths are relative to the
    current working directory, whereas ``input_file`` is resolved relative
    to this script - confirm that this difference is intended.
    """
    conn = sqlite3.connect('./data/simplewiki.db')
    try:
        cursor = conn.cursor()

        cursor.execute("CREATE TABLE IF NOT EXISTS articles (id INTEGER PRIMARY KEY, title TEXT UNIQUE)")
        cursor.execute("CREATE TABLE IF NOT EXISTS links (from_article_id INTEGER, to_article_id INTEGER, UNIQUE(from_article_id, to_article_id))")
        cursor.execute("CREATE TABLE IF NOT EXISTS redirects (from_article_id INTEGER, to_article_id INTEGER, UNIQUE(from_article_id, to_article_id))")

        # Every index uses IF NOT EXISTS so that the script can be re-run
        # against an existing database without failing on "already exists".
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_links_from_to ON links(from_article_id, to_article_id)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_links_from ON links(from_article_id)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_links_to ON links(to_article_id)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_articles_id ON articles(id)")

        print("\nIngesting articles:")
        Ingest_Links(cursor=cursor, conn=conn)
        conn.commit()

        print("\nApply Redirections:")
        with open('./redirect.sql', 'r') as f:
            sql = f.read()
        conn.executescript(sql)
        print(" done")

        cursor.close()
    finally:
        # Release the database handle even if ingestion or the script fails.
        conn.close()
# Run the ingest only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()