repo_intent.py
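"""Attach citation-intent labels to the ePrints publication data.

Joins the extracted ePrints CSV with two manually collected lists of GitHub
repository IDs (repositories cited as created software vs. repositories that
are merely mentioned) and writes the combined dataset to eprints_w_intent.csv.
"""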
import argparse
import os

import pandas as pd


def main(filepath, true_list_path, false_list_path, outdir):
    # read ePrints data
    df = pd.read_csv(filepath)
    # filter to those we have manually collected intent labels for
    df = df[df.page_no < 2]
    # for each GitHub repository cited in a publication, record whether it was
    # mentioned as created software; each list file holds one repo ID per line
    created_map = {"github_repo_id": [], "mention_created": []}
    for path, v in [(true_list_path, True), (false_list_path, False)]:
        with open(path, "r") as f:
            for line in f:
                created_map["github_repo_id"].append(line.rstrip("\n"))
                created_map["mention_created"].append(v)
    created_df = pd.DataFrame(created_map)
    # construct the final dataset: a left join keeps every labelled repository
    # and attaches its ePrints metadata
    merged_df = created_df.merge(df, how="left", left_on="github_repo_id", right_on="github_user_cleaned_url")
    print(f"[INFO] Candidates: {len(df)}\n Mapped: {len(created_df)}\n Result: {len(merged_df)}")
    merged_df["date"] = pd.to_datetime(merged_df["date"])
    merged_df = merged_df.astype({
        "github_repo_id": str,
        "mention_created": bool,
        "title": str,
        "author_for_reference": str,
        "pdf_url": str,
        "page_no": int,
        "domain_url": str,
        "pattern_cleaned_url": str,
        "github_user_cleaned_url": str,
        "year": int,
        "eprints_repo": str
    })
    # correct duplicates manually
    corrected_cnt = 0
    idx = merged_df[(merged_df.github_repo_id == "IraKorshunova/folk-rnn") & (merged_df.pdf_url == "https://eprints.kingston.ac.uk/id/eprint/44298/1/Ben-Tal-O-44298-VoR.pdf")].index[0]
    merged_df.loc[idx, "mention_created"] = False
    corrected_cnt += 1
    print(f"[INFO] Manual corrections: {corrected_cnt}")
    # rename columns for clarity
    merged_df.rename(columns={
        "title": "pub_title",
        "author_for_reference": "pub_author_for_reference",
        "domain_url": "detected_github_url",
        "pattern_cleaned_url": "pattern_matched_github_url",
        "date": "eprints_date",
        "year": "eprints_pub_year",
    }, inplace=True)
    merged_df.drop(columns=["github_user_cleaned_url"], inplace=True)
    print("Schema:")
    merged_df.info()
    merged_df.to_csv(os.path.join(outdir, "eprints_w_intent.csv"))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="repo_intent",
        description="Add repository intent to the ePrints publication data."
    )
    parser.add_argument("-f", "--file", required=True, type=str, help="data file containing info extracted from all ePrints repositories")
    parser.add_argument("--true", required=True, type=str, help="list of repo links cited as created")
    parser.add_argument("--false", required=True, type=str, help="list of repo links that are just mentioned")
    parser.add_argument("--outdir", default="../../data/derived", type=str, help="path to derived data directory")
    args = parser.parse_args()
    main(args.file, args.true, args.false, args.outdir)
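
# Example invocation (a sketch; the input file paths below are hypothetical,
# only the --outdir default comes from the script itself):
#   python repo_intent.py -f ../../data/raw/eprints.csv \
#       --true ../../data/labels/created.txt \
#       --false ../../data/labels/mentioned.txt \
#       --outdir ../../data/derived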