Skip to content

Commit

Permalink
Closes #1420: Integrate the HFRI project mining
Browse files Browse the repository at this point in the history
Supplementing integration tests suite with HFRI projects mining case.

This squashed commit includes the following set of projects.sql madis script related changes:
* fix zero results for HFRI (move the GROUP BY to the end, after the post-processing step)
* resolve HFRI false positives
* change the positives for HFRI
  • Loading branch information
marekhorst committed Oct 30, 2023
1 parent db5389d commit d10941d
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ hidden var 'nihposshort' from select jmergeregexp(jgroup(word)) from (select * f
-- NIH pattern variables: each is built from a word table whose rows are ordered longest-first
-- (order by length(word) desc), grouped with jgroup and passed to jmergeregexp
-- (presumably yielding one alternation regex — confirm against the madIS function docs).
hidden var 'nihposfull' from select jmergeregexp(jgroup(word)) from (select * from nihposnamesfull order by length(word) desc);
hidden var 'nihpositives' from select jmergeregexp(jgroup(word)) from (select * from nihpositives order by length(word) desc);
hidden var 'nihnegatives' from select jmergeregexp(jgroup(word)) from (select * from nihnegatives order by length(word) desc);
-- HFRI positive context: a literal alternation of two-word phrases.
-- The output_hfri table tests it against its textwin1 column.
hidden var 'hfripos' from select "(?:innovation project)|(?:multigold numbered)|(?:fellowship number)|(?:grant fellowship)|(?:innovation grant)|(?:scholarship code)|(?:technology gsrt)|(?:project number)|(?:faculty grant)|(?:hfri project)|(?:elidek grant)|(?:agreement no)|(?:funded grant)|(?:project no)|(?:hfri grant)|(?:gsrt grant)|(?:hfri fm17)|(?:hfri code)|(?:grant no)|(?:grant ga)|(?:ga hfri)";
-- HFRI negative context: a text window matching this pattern is rejected when
-- hfri_unidentified_only is built.
-- NOTE(review): the dots in "i.k.a." are unescaped, so each matches any character — confirm this is intended.
hidden var 'hfrineg' from select "(?:\bekt\b)|(?:eliamep)|(?:\bforth\b)|(?:\bi.k.a.\b)|(?:\bipep\b)";
-- Ids of the placeholder grants (grantid = "unidentified") for MIUR, WT and GSRI;
-- at most one id per funder (limit 1).
hidden var 'miur_unidentified' from select id from grants where fundingclass1="MIUR" and grantid="unidentified" limit 1;
hidden var 'wt_unidentified' from select id from grants where fundingclass1="WT" and grantid="unidentified" limit 1;
hidden var 'gsri_unidentified' from select id from grants where fundingclass1="GSRI" and grantid="unidentified" limit 1;
Expand All @@ -20,23 +22,43 @@ hidden var 'nserc_unidentified' from (select id from grants where fundingclass1=
-- Ids of the placeholder grants (grantid = "unidentified") for SSHRC, NRC, INCa and HFRI;
-- at most one id per funder (limit 1).
-- hfri_unidentified is used below both as a guard (where var('hfri_unidentified'))
-- and as the project id assigned to rows of hfri_unidentified_only.
hidden var 'sshrc_unidentified' from (select id from grants where fundingclass1="SSHRC" and grantid="unidentified" limit 1);
hidden var 'nrc_unidentified' from (select id from grants where fundingclass1="NRC" and grantid="unidentified" limit 1);
hidden var 'inca_unidentified' from (select id from grants where fundingclass1="INCa" and grantid="unidentified" limit 1);
hidden var 'hfri_unidentified' from (select id from grants where fundingclass1="HFRI" and grantid="unidentified" limit 1);

create temp table pubs as setschema 'c1,c2' select jsonpath(c1, '$.id', '$.text') from stdinput();
create temp table pubs as setschema 'c1,c2' select jsonpath(c1, '$.id', '$.text') from stdinput();

-- incaprojects: one row per INCa grant, with matching helpers derived from its grantid.
-- Innermost select (over grants where fundingclass1 = "INCa" and gid is not null):
--   terms = the (\D+) match in grantid (a run of non-digit characters)
--   gid   = the trailing run of digits / "|" / whitespace in grantid that follows a non-digit
-- Middle select:
--   terms = digits stripped, then keywords() and s2j()
--   gid   = each whitespace run replaced by the literal text \s*
-- Outer select:
--   terms = jmergeregexp(terms), lt = jlen(terms)
create temp table incaprojects as
select id, grantid, gid, jmergeregexp(terms) as terms, jlen(terms) as lt from
(select id, grantid, s2j(keywords(regexpr("\d", terms, ""))) as terms, regexpr("\s+",gid,"\s*") as gid from
(select id, grantid, regexpr("(\D+)",grantid) as terms, regexpr("\D([\d|\s]+)$",grantid) as gid from grants
where fundingclass1 = "INCa" and gid is not null));


-- hfri_unidentified_only: candidate HFRI mentions, each row tagged with the HFRI
-- "unidentified" grant id (var('hfri_unidentified')).
-- Input: rows of pubs with non-null text (c2); the text is lower-cased and stopword-filtered
-- before textwindow2s (arguments 10,3,10) is applied with a pattern of HFRI / ELIDEK name
-- variants in Latin and Greek script. Output columns: docid, id, prev, middle, next.
-- A window is kept only when all of the following hold:
--   * var('hfri_unidentified') evaluates as true
--   * the joined window (prev, middle, next) does not contain "himalayan"
--   * the joined window does not match var('hfrineg')
--   * the joined window matches gsrt|greek|greece|hellenic
create temp table hfri_unidentified_only as select docid, var('hfri_unidentified') as id, prev, middle, next from (setschema 'docid,prev,middle,next'
select c1 as docid, textwindow2s(filterstopwords(lower(c2)), 10,3,10, "\bhfri\b|h\.f\.r\.i\.|hellenic foundation research|greek foundation research|ελιδεκ|ελληνικο ιδρυμα ερευνας|ελληνικό ίδρυμα έρευνας|elidek") from ((setschema 'c1,c2' select * from pubs where c2 is not null)))
where var('hfri_unidentified') and lower(j2s(prev,middle,next)) not like "%himalayan%" and not regexprmatches(var('hfrineg'), lower(j2s(prev,middle,next))) and regexprmatches("gsrt|greek|greece|hellenic",lower(j2s(prev,middle,next)));


-- output_hfri: HFRI mentions resolved to a concrete HFRI grant, grouped by (docid, id).
-- Columns: C1 (JSON record built with jdict, confidenceLevel fixed at 0.8), docid, id.
-- Steps, innermost first:
--   1. t1: rebuild a snippet from each hfri_unidentified_only row (prev, middle, next joined,
--      then keywords(), lower() and filterstopwords()) and extract every 2-4 digit number
--      that follows a word boundary or "fm17"; jsplitv exposes the numbers as column res.
--   2. t2: HFRI grants (id, grantid as project_id); joined to t1 on res = project_id.
--   3. Re-window the snippet around the matched number (textwindow2s 2,1,1) into
--      textwin1, textwin2, textwin3.
--   4. Keep the match when textwin1 matches var('hfripos'), or textwin2 contains
--      "fm17" followed by the number, or textwin2 equals "." followed by the number;
--      textwin1 and textwin2 must be non-null and textwin3 must not be "hellenic".
-- The group by is applied last, i.e. after the filtering in step 4.
create temp table output_hfri as
select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', textsnippet_) as C1, docid, id from (
select docid, id, textsnippet_ from
(setschema 'docid,textsnippet_,textwin1,textwin2,textwin3, proj_id, id' select docid, textsnippet_, textwindow2s(textsnippet_,2,1,1,"(?:\b|fm17)"||proj_id||"\b") as textwin, proj_id, id from (
select docid,t2.Project_id as proj_id, t2.id, t1.textsnippet_ from
( setschema 'docid, textsnippet_, res' select distinct docid, textsnippet_, jsplitv(regexprfindall('(?:\b|fm17)(\d{2,4})\b',textsnippet_)) as res
from ( select docid, filterstopwords(lower(keywords(j2s(prev,middle,next)))) as textsnippet_ from (setschema 'docid, id, prev, middle, next' select * from hfri_unidentified_only)) ) as t1,
(select id, grantid as project_id from grants where fundingclass1 = "HFRI") as t2 where res = t2.project_id and t2.project_id is not null
)
) where (regexprmatches(var('hfripos'),textwin1) or textwin2 like "%"||"fm17"||proj_id||"%" or textwin2 = "."||proj_id) and textwin1 is not null and textwin2 is not null
and textwin3 is not "hellenic" group by docid, id);



-- matched_undefined_miur_only: distinct text windows (textwindow2s 10,1,10) from pubs rows with
-- non-null text, taken around tokens of the form RBSI + 2 digits + 4 word characters.
-- Each row is tagged with the MIUR "unidentified" grant id (var('miur_unidentified')).
-- Rows are produced only when var('miur_unidentified') evaluates as true and the middle
-- column itself matches the RBSI pattern.
create temp table matched_undefined_miur_only as select distinct docid, var('miur_unidentified') as id, prev,middle,next from (setschema 'docid,prev,middle,next'
select c1 as docid, textwindow2s(c2,10,1,10, '\b(?:RBSI\d{2}\w{4})\b') from (setschema 'c1,c2' select * from pubs where c2 is not null))
where var('miur_unidentified') and (regexprmatches('\b(?:RBSI\d{2}\w{4})\b', middle));





create temp table matched_undefined_inca_only as select distinct docid, var('inca_unidentified') as id, prev,middle,next from (setschema 'docid,prev,middle,next'
select c1 as docid, textwindow2s(c2,15,4,10, '\bINCa|French National Cancer Institute') from (setschema 'c1,c2' select * from pubs where c2 is not null))
where var('inca_unidentified')
Expand Down Expand Up @@ -318,6 +340,7 @@ delete from output_table where fundingClass1="CHIST-ERA" and grantid="unidentifi
-- Remove "unidentified" fallback rows for documents that already have a match for the same funder:
-- MIUR, WT and GSRI are checked against output_table; HFRI is checked against output_hfri.
delete from matched_undefined_miur_only where docid in (select docid from output_table where fundingClass1="MIUR");
delete from matched_undefined_wt_only where docid in (select docid from output_table where fundingClass1="WT");
delete from matched_undefined_gsri where docid in (select docid from output_table where fundingClass1="GSRI");
delete from hfri_unidentified_only where docid in (select docid from output_hfri);

-- Remove an output_table row (docid, id) whose id belongs to a grant with grantid 246686, 283595
-- or 643410 when the same document also has a row whose id belongs to a grant listed in gold.
delete from output_table where j2s(docid,id) in (select j2s(T.docid, T.id) from output_table S, output_table T where S.docid = T.docid and S.id in (select id from grants where grantid in (select * from gold)) and T.id in (select id from grants where grantid in ("246686", "283595","643410")));
-- Remove EC rows whose (docid, grantid) pair also appears under funder RCN.
delete from output_table where fundingclass1 = "EC" and j2s(docid, grantid) in (select j2s(docid, grantid) from output_table where fundingclass1 = "RCN");
Expand All @@ -339,10 +362,14 @@ select C1 from output_table
union all
select C1 from secondary_output_table
union all
select C1 from output_hfri
union all
select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', prev||" "||middle||" "||next) from matched_undefined_miur_only
union all
select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', prev||" "||middle||" "||next) from matched_undefined_wt_only
union all
select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', prev||" "||middle||" "||next) from matched_undefined_inca_only
union all
select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', prev||" "||middle||" "||next) from matched_undefined_gsri;
select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', prev||" "||middle||" "||next) from matched_undefined_gsri
union all
select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', prev||" << "||middle||" >> "||next) from (select * from hfri_unidentified_only group by docid);
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,4 @@
{"text":"Funding sources/sponsors: JL is funded by a National Health and Medical Research Council of Australia Partnership Project Grant (1056888).", "id":"50|sharebioRxiv::20773c458e7814d5b47c6c498b775d2b"}
{"text":"This material is based upon work supported by the National Science Foundation under Grant No. ATM-0513463.", "id":"50|od_______212::0f31511cdbd148bf5446b52f49ba8544"}
{"text":"Acknowledgements This work was partially supported by Science Foundation Ireland Grant 04/IN1/I478 and Science Foundation Ireland Grant 03/RPT1/I382.", "id":"50|doi_________::9c4ddd5d830294ab76d7e7919a379f3b"}
{"text":"Acknowledgments: This work was supported by the Hellenic Foundation for Research and Innovation (HFRI - Project No: 789)", "id":"50|arXiv_______::54002047659adf031293eabfbfe9938b"}
Original file line number Diff line number Diff line change
Expand Up @@ -448,4 +448,10 @@
"projectId": "40|sfi_________::05395fd69f5aa3ba5e9e75dcf527d8ac",
"confidenceLevel": 0.8,
"textsnippet": "Acknowledgements This work was partially supported by Science Foundation Ireland Grant <<< 04/IN1/I478 >>> and Science Foundation Ireland Grant"
}
{
"documentId": "50|arXiv_______::54002047659adf031293eabfbfe9938b",
"projectId": "40|hfri________::644d89adeca811786cf72d7967ec9813",
"confidenceLevel": 0.8,
"textsnippet": "acknowledgments work supported hellenic foundation research innovation hfri project 789"
}
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,5 @@
{"id": "40|irb_hr______::0a480b267a9c5f652a8cc607bda9fe1c", "projectGrantId": "053-0532265-2255", "projectAcronym": null, "fundingClass": "MZOS::", "jsonextrainfo": "{}"}
{"id": "40|irb_hr______::37ca9ece55928656726557c7c0a36a1a", "projectGrantId": "IP-2013-11-1021", "projectAcronym": null, "fundingClass": "HRZZ::", "jsonextrainfo": "{}"}
{"id": "40|nhmrc_______::019492919738381cbee98a17ae1dae25", "projectGrantId": "1056888", "projectAcronym": null, "fundingClass": "NHMRC::NHMRC Partnerships", "jsonextrainfo": "{}"}
{"id": "40|hfri________::644d89adeca811786cf72d7967ec9813", "projectGrantId": "789", "projectAcronym": null, "fundingClass": "HFRI::", "jsonextrainfo": "{}"}
{"id": "40|hfri________::cb5d92ce46b051859d1d9655e0ae7b46", "projectGrantId": "unidentified", "projectAcronym": null, "fundingClass": "HFRI::", "jsonextrainfo": "{}"}
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@
"type": "COUNTER",
"value": "2"
}
{
"key": "processing.referenceExtraction.project.references.byfunder.hfri",
"type": "COUNTER",
"value": "1"
}
{
"key": "processing.referenceExtraction.project.references.byfunder.hrzz",
"type": "COUNTER",
Expand Down Expand Up @@ -171,5 +176,5 @@
{
"key": "processing.referenceExtraction.project.references.total",
"type": "COUNTER",
"value": "75"
"value": "76"
}

0 comments on commit d10941d

Please sign in to comment.