Skip to content
This repository was archived by the owner on Apr 28, 2025. It is now read-only.

Commit 841e65e

Browse files
committed
add a sync function for building a sqlite db
1 parent 0cf5ad4 commit 841e65e

File tree

3 files changed

+109
-3
lines changed

3 files changed

+109
-3
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,7 @@ __pycache__/
2020
/dist/
2121
/build/
2222
/sysrev.egg-info/
23+
24+
scratch.py
25+
.env
26+
.sr

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
setup(
88
name='sysrev',
9-
version='1.2.4',
9+
version='1.3.1',
1010
description='get sysrev project data and use the sysrev api',
1111
long_description=long_description,
1212
long_description_content_type='text/markdown', # Specify the content type here

sysrev/client.py

Lines changed: 104 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import requests
1+
import requests, sqlite3, pathlib, pandas as pd, json, tqdm
22

33
class LabelTransformer:
44

@@ -25,19 +25,121 @@ def transform_label(self, label_type, label_value):
2525
return self.handle_categorical_or_string(label_value)
2626
else:
2727
raise ValueError("Invalid label type")
28+
29+
class Synchronizer:
    """Mirror a sysrev project into a local SQLite file at ``.sr/sr.sqlite``.

    The database path is relative to the current working directory.
    """

    # Single source of truth for where the local database lives.
    _DB_DIR = ".sr"
    _DB_PATH = ".sr/sr.sqlite"

    @staticmethod
    def create_sqlite_db():
        """Create ``.sr/sr.sqlite`` with empty tables and indexes if missing.

        Safe to call repeatedly: the directory, the three tables
        (``article_data``, ``labels``, ``article_label``) and both indexes
        are only created when they do not exist yet.

        Declared as a static method so it works both as
        ``Synchronizer.create_sqlite_db()`` and on an instance.

        Note that ``sync`` writes with ``if_exists='replace'``, which makes
        pandas drop and recreate each table it writes, so the column
        definitions below are not retained after a sync.
        """
        pathlib.Path(Synchronizer._DB_DIR).mkdir(exist_ok=True)
        conn = sqlite3.connect(Synchronizer._DB_PATH)
        try:
            c = conn.cursor()

            # article_data and labels come first because article_label
            # declares foreign keys that reference both of them.
            c.execute('''
                CREATE TABLE IF NOT EXISTS article_data (
                    primary_title TEXT,
                    consensus INTEGER,
                    article_id TEXT PRIMARY KEY,
                    updated_time TEXT,
                    notes TEXT,
                    resolve INTEGER
                );
            ''')

            c.execute('''
                CREATE TABLE IF NOT EXISTS labels (
                    label_id INTEGER PRIMARY KEY,
                    label_id_local TEXT,
                    category TEXT,
                    definition TEXT,
                    name TEXT,
                    consensus INTEGER,
                    question TEXT,
                    project_ordering INTEGER,
                    short_label TEXT,
                    label_id_global TEXT,
                    root_label_id_local TEXT,
                    global_label_id TEXT,
                    project_id INTEGER,
                    enabled INTEGER,
                    value_type TEXT,
                    required INTEGER,
                    owner_project_id INTEGER
                );
            ''')

            c.execute('''
                CREATE TABLE IF NOT EXISTS article_label (
                    article_id TEXT,
                    label_id INTEGER,
                    user_id INTEGER,
                    answer TEXT,
                    inclusion INTEGER,
                    updated_time TEXT,
                    confirm_time TEXT,
                    resolve INTEGER,
                    PRIMARY KEY (article_id, label_id),
                    FOREIGN KEY (label_id) REFERENCES labels (label_id),
                    FOREIGN KEY (article_id) REFERENCES article_data (article_id)
                );
            ''')

            # Indexes for improved query performance
            c.execute('CREATE INDEX IF NOT EXISTS idx_labels_project_id ON labels (project_id);')
            c.execute('CREATE INDEX IF NOT EXISTS idx_article_label_user_id ON article_label (user_id);')

            conn.commit()
        finally:
            # Release the file handle even when a statement above fails.
            conn.close()

    @staticmethod
    def _jsonify_column(frame, column):
        """JSON-encode every value of ``column`` in place, if the column exists."""
        # An empty project yields a frame without this column; indexing it
        # unconditionally would raise KeyError.
        if column in frame.columns:
            frame[column] = frame[column].apply(json.dumps)

    @staticmethod
    def _write_table(conn, table, frame):
        """Replace ``table`` with the contents of ``frame``."""
        if len(frame.columns) == 0:
            # SQLite tables need at least one column, so a frame without
            # columns cannot be written. Drop any stale copy instead so the
            # database still mirrors the remote state. ``table`` is always
            # one of the hard-coded names used by ``sync``.
            conn.execute(f'DROP TABLE IF EXISTS "{table}"')
            return
        frame.to_sql(table, conn, if_exists='replace', index=False)

    def sync(self, client, project_id):
        """Download a project's labels and articles into ``.sr/sr.sqlite``.

        Args:
            client: Object exposing ``get_project_info(project_id)``,
                ``get_labels(project_id)`` and
                ``fetch_all_articles(project_id)``.
            project_id: Identifier of the project to download.

        The ``labels``, ``article_label`` and ``article_data`` tables are
        replaced on every call. The ``definition`` column of the labels and
        the ``resolve`` column of the articles are stored as JSON text.
        """
        project_info = client.get_project_info(project_id)

        labels_df = pd.DataFrame(client.get_labels(project_id))
        self._jsonify_column(labels_df, 'definition')

        # The article count is only used to size the progress bar.
        n_articles = project_info['result']['project']['stats']['articles']
        articles = list(tqdm.tqdm(client.fetch_all_articles(project_id), total=n_articles))

        # Flatten the per-article label lists; articles whose 'labels' entry
        # is missing, None or empty contribute nothing.
        article_labels = [lbl for a in articles for lbl in (a.get('labels') or [])]
        article_label_df = pd.DataFrame(article_labels)

        article_data = [{k: v for k, v in a.items() if k != 'labels'} for a in articles]
        article_data_df = pd.DataFrame(article_data)
        self._jsonify_column(article_data_df, 'resolve')

        # sqlite3.connect cannot create missing parent directories.
        pathlib.Path(self._DB_DIR).mkdir(exist_ok=True)
        conn = sqlite3.connect(self._DB_PATH)
        try:
            self._write_table(conn, 'labels', labels_df)
            self._write_table(conn, 'article_label', article_label_df)
            self._write_table(conn, 'article_data', article_data_df)
            conn.commit()
        finally:
            conn.close()
29123
class Client():
30124

31125
def __init__(self, api_key, base_url="https://www.sysrev.com"):
32126
self.api_key = api_key
33127
self.base_url = base_url
128+
129+
def sync(self, project_id):
    """Run a ``Synchronizer`` for ``project_id`` using this client for API access."""
    synchronizer = Synchronizer()
    synchronizer.sync(self, project_id)
34131

35132
def get_project_info(self, project_id):
    """Fetch the ``project-info`` payload for ``project_id``.

    Issues a GET to ``<base_url>/api-json/project-info`` with the API key as
    a bearer token and returns the decoded JSON body.
    """
    response = requests.get(
        f"{self.base_url}/api-json/project-info",
        headers={"Authorization": f"Bearer {self.api_key}"},
        params={"project-id": project_id},
    )
    return response.json()
40-
137+
138+
def get_labels(self, project_id):
139+
raw_labels = self.get_project_info(project_id)['result']['project']['labels']
140+
labels = [{"label_id": label_id} | raw_labels[label_id] for label_id in raw_labels.keys()]
141+
return labels
142+
41143
def set_labels(self, project_id, article_id, label_ids, label_values, label_types, confirm=False, change=False, resolve=False):
42144
endpoint = f"{self.base_url}/api-json/set-labels"
43145
headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}

0 commit comments

Comments
 (0)