1
- import requests
1
+ import requests , sqlite3 , pathlib , pandas as pd , json , tqdm
2
2
3
3
class LabelTransformer :
4
4
@@ -25,19 +25,121 @@ def transform_label(self, label_type, label_value):
25
25
return self .handle_categorical_or_string (label_value )
26
26
else :
27
27
raise ValueError ("Invalid label type" )
28
+
29
class Synchronizer:
    """Copy a project's labels and articles into a local SQLite database.

    The database file is ``.sr/sr.sqlite``, resolved relative to the current
    working directory.
    """

    @staticmethod
    def create_sqlite_db():
        """Create ``.sr/sr.sqlite`` with its tables and indexes if missing.

        Declared as a static method so it can be called both on the class
        (``Synchronizer.create_sqlite_db()``) and on an instance; the method
        uses no instance state. Every statement uses ``IF NOT EXISTS``, so
        calling it repeatedly is harmless.
        """
        pathlib.Path(".sr").mkdir(exist_ok=True)
        conn = sqlite3.connect('.sr/sr.sqlite')
        try:
            c = conn.cursor()

            # article_data and labels come first: article_label references both.
            c.execute('''
                CREATE TABLE IF NOT EXISTS article_data (
                    primary_title TEXT,
                    consensus INTEGER,
                    article_id TEXT PRIMARY KEY,
                    updated_time TEXT,
                    notes TEXT,
                    resolve INTEGER
                );
            ''')

            c.execute('''
                CREATE TABLE IF NOT EXISTS labels (
                    label_id INTEGER PRIMARY KEY,
                    label_id_local TEXT,
                    category TEXT,
                    definition TEXT,
                    name TEXT,
                    consensus INTEGER,
                    question TEXT,
                    project_ordering INTEGER,
                    short_label TEXT,
                    label_id_global TEXT,
                    root_label_id_local TEXT,
                    global_label_id TEXT,
                    project_id INTEGER,
                    enabled INTEGER,
                    value_type TEXT,
                    required INTEGER,
                    owner_project_id INTEGER
                );
            ''')

            # One row per (article, label) pair, linked to both parent tables.
            c.execute('''
                CREATE TABLE IF NOT EXISTS article_label (
                    article_id TEXT,
                    label_id INTEGER,
                    user_id INTEGER,
                    answer TEXT,
                    inclusion INTEGER,
                    updated_time TEXT,
                    confirm_time TEXT,
                    resolve INTEGER,
                    PRIMARY KEY (article_id, label_id),
                    FOREIGN KEY (label_id) REFERENCES labels (label_id),
                    FOREIGN KEY (article_id) REFERENCES article_data (article_id)
                );
            ''')

            # Indexes for improved query performance.
            c.execute('CREATE INDEX IF NOT EXISTS idx_labels_project_id ON labels (project_id);')
            c.execute('CREATE INDEX IF NOT EXISTS idx_article_label_user_id ON article_label (user_id);')

            conn.commit()
        finally:
            # Release the file handle even when a statement above fails.
            conn.close()

    def sync(self, client, project_id):
        """Download a project's labels and articles and write them to SQLite.

        Args:
            client: Object providing ``get_project_info(project_id)``,
                ``get_labels(project_id)`` and ``fetch_all_articles(project_id)``.
            project_id: Identifier of the project to download.

        The ``labels``, ``article_label`` and ``article_data`` tables are
        written with ``if_exists='replace'``, so each call overwrites the
        previous contents.

        NOTE(review): per the pandas documentation, ``replace`` drops the
        table before inserting, so the column types, keys and indexes declared
        by ``create_sqlite_db`` are not kept after a sync -- confirm whether
        that is intended.
        """
        project_info = client.get_project_info(project_id)

        labels_df = pd.DataFrame(client.get_labels(project_id))
        # Stored as JSON text; presumably the definition is a nested structure
        # that SQLite cannot bind directly -- TODO confirm.
        labels_df['definition'] = labels_df['definition'].apply(json.dumps)

        # The article count from the project stats only sizes the progress bar.
        n_articles = project_info['result']['project']['stats']['articles']
        articles = list(tqdm.tqdm(client.fetch_all_articles(project_id), total=n_articles))

        # Flatten the per-article label lists; an article whose 'labels' entry
        # is missing or None contributes nothing.
        article_labels = [lbl for a in articles for lbl in (a.get('labels') or [])]
        article_label_df = pd.DataFrame(article_labels)

        # Everything except the nested labels becomes the article_data row.
        article_data = [{k: v for k, v in a.items() if k != 'labels'} for a in articles]
        article_data_df = pd.DataFrame(article_data)
        article_data_df['resolve'] = article_data_df['resolve'].apply(json.dumps)

        # Make sure the target directory exists so sync() also works when
        # create_sqlite_db() has not been called beforehand.
        pathlib.Path(".sr").mkdir(exist_ok=True)
        conn = sqlite3.connect('.sr/sr.sqlite')
        try:
            labels_df.to_sql('labels', conn, if_exists='replace', index=False)
            article_label_df.to_sql('article_label', conn, if_exists='replace', index=False)
            article_data_df.to_sql('article_data', conn, if_exists='replace', index=False)
        finally:
            conn.close()
29
123
class Client ():
30
124
31
125
def __init__ (self , api_key , base_url = "https://www.sysrev.com" ):
32
126
self .api_key = api_key
33
127
self .base_url = base_url
128
+
129
def sync(self, project_id):
    """Run a fresh ``Synchronizer`` for ``project_id`` with this client as its data source."""
    synchronizer = Synchronizer()
    synchronizer.sync(self, project_id)
34
131
35
132
def get_project_info(self, project_id):
    """Fetch the ``project-info`` document for ``project_id``.

    Sends a GET request to ``<base_url>/api-json/project-info`` with the API
    key as a bearer token and returns the decoded JSON body.
    """
    auth_headers = {"Authorization": f"Bearer {self.api_key}"}
    query = {"project-id": project_id}
    url = f"{self.base_url}/api-json/project-info"
    return requests.get(url, headers=auth_headers, params=query).json()
40
-
137
+
138
def get_labels(self, project_id):
    """Return the project's labels as a list of flat dictionaries.

    The project info stores labels as a mapping keyed by label id. Each entry
    is returned with that key added under ``"label_id"``; a ``"label_id"``
    field already present in the entry takes precedence over the key.
    """
    info = self.get_project_info(project_id)
    by_id = info['result']['project']['labels']
    return [{"label_id": key, **fields} for key, fields in by_id.items()]
142
+
41
143
def set_labels (self , project_id , article_id , label_ids , label_values , label_types , confirm = False , change = False , resolve = False ):
42
144
endpoint = f"{ self .base_url } /api-json/set-labels"
43
145
headers = {"Authorization" : f"Bearer { self .api_key } " , "Content-Type" : "application/json" }
0 commit comments