-- NOTE: this text is a git patch (unified diff), not a standalone SQL script. These
-- comment lines sit in the free-text preamble before the first `diff --git` header;
-- nothing inside the hunks themselves has been altered.
--
-- Purpose: add HFRI funder detection to the project reference-extraction script and
-- extend the test fixtures accordingly.
--
-- projects.sql
--   * New hidden vars 'hfripos' (alternation of phrases such as "project no",
--     "hfri grant", "grant no") and 'hfrineg' (ekt, eliamep, forth, i.k.a., ipep).
--   * New hidden var 'hfri_unidentified': id of the grants row with
--     fundingclass1="HFRI" and grantid="unidentified" (limit 1).
--   * New temp table hfri_unidentified_only: runs textwindow2s(..., 10,3,10, <HFRI name
--     pattern>) over filterstopwords(lower(c2)) for pubs rows with non-null c2, and keeps
--     rows where var('hfri_unidentified') is truthy, the joined window is not like
--     "%himalayan%", does not match var('hfrineg'), and matches
--     "gsrt|greek|greece|hellenic".
--   * New temp table output_hfri: extracts 2-4 digit numbers (pattern
--     '(?:\b|fm17)(\d{2,4})\b') from the keyword-normalised window text, joins them to
--     grants.grantid where fundingclass1 = "HFRI", then keeps rows whose textwin1 matches
--     var('hfripos') or whose textwin2 contains "fm17"||proj_id or equals "."||proj_id,
--     with textwin1/textwin2 not null and textwin3 is not "hellenic"; grouped by
--     docid, id and emitted with confidenceLevel 0.8.
--   * New delete: removes from hfri_unidentified_only every docid present in output_hfri.
--   * Final union all: appends `select C1 from output_hfri` and one jdict row per docid
--     from hfri_unidentified_only; the previous terminating ';' moves to the new last select.
--
-- Test fixtures
--   * document_text.json: one new document mentioning "HFRI - Project No: 789".
--   * document_to_project.json: expected link of that document to the HFRI project.
--   * project.json: two HFRI projects (projectGrantId "789" and "unidentified").
--   * report_funder.json: new byfunder.hfri counter "1"; total changes from "75" to "76".
--
-- NOTE(review): in 'hfrineg' the dots of `\bi.k.a.\b` are unescaped; since the var is
--   passed to regexprmatches they act as "any character" - confirm this is intended.
-- NOTE(review): `select * from hfri_unidentified_only group by docid` selects
--   non-aggregated columns; which row is returned per docid depends on the engine -
--   confirm that an arbitrary row is acceptable.
-- NOTE(review): the HFRI unidentified snippet is built with " << " / " >> " separators
--   while the other matched_undefined_* selects use " " - confirm this is intentional.
-- NOTE(review): the `create temp table pubs` line is removed and re-added with no visible
--   difference here - presumably a whitespace-only change; verify.
diff --git a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql index 829eca2e0..ab709e71e 100644 --- a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql +++ b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql @@ -12,6 +12,8 @@ hidden var 'nihposshort' from select jmergeregexp(jgroup(word)) from (select * f hidden var 'nihposfull' from select jmergeregexp(jgroup(word)) from (select * from nihposnamesfull order by length(word) desc); hidden var 'nihpositives' from select jmergeregexp(jgroup(word)) from (select * from nihpositives order by length(word) desc); hidden var 'nihnegatives' from select jmergeregexp(jgroup(word)) from (select * from nihnegatives order by length(word) desc); +hidden var 'hfripos' from select "(?:innovation project)|(?:multigold numbered)|(?:fellowship number)|(?:grant fellowship)|(?:innovation grant)|(?:scholarship code)|(?:technology gsrt)|(?:project number)|(?:faculty grant)|(?:hfri project)|(?:elidek grant)|(?:agreement no)|(?:funded grant)|(?:project no)|(?:hfri grant)|(?:gsrt grant)|(?:hfri fm17)|(?:hfri code)|(?:grant no)|(?:grant ga)|(?:ga hfri)"; +hidden var 'hfrineg' from select "(?:\bekt\b)|(?:eliamep)|(?:\bforth\b)|(?:\bi.k.a.\b)|(?:\bipep\b)"; hidden var 'miur_unidentified' from select id from grants where fundingclass1="MIUR" and grantid="unidentified" limit 1; hidden var 'wt_unidentified' from select id from grants where fundingclass1="WT" and grantid="unidentified" limit 1; hidden var 'gsri_unidentified' from select id from grants where fundingclass1="GSRI" and grantid="unidentified" limit 1; @@ 
-20,16 +22,33 @@ hidden var 'nserc_unidentified' from (select id from grants where fundingclass1= hidden var 'sshrc_unidentified' from (select id from grants where fundingclass1="SSHRC" and grantid="unidentified" limit 1); hidden var 'nrc_unidentified' from (select id from grants where fundingclass1="NRC" and grantid="unidentified" limit 1); hidden var 'inca_unidentified' from (select id from grants where fundingclass1="INCa" and grantid="unidentified" limit 1); +hidden var 'hfri_unidentified' from (select id from grants where fundingclass1="HFRI" and grantid="unidentified" limit 1); -create temp table pubs as setschema 'c1,c2' select jsonpath(c1, '$.id', '$.text') from stdinput(); +create temp table pubs as setschema 'c1,c2' select jsonpath(c1, '$.id', '$.text') from stdinput(); create temp table incaprojects as select id, grantid, gid, jmergeregexp(terms) as terms, jlen(terms) as lt from (select id, grantid, s2j(keywords(regexpr("\d", terms, ""))) as terms, regexpr("\s+",gid,"\s*") as gid from (select id, grantid, regexpr("(\D+)",grantid) as terms, regexpr("\D([\d|\s]+)$",grantid) as gid from grants where fundingclass1 = "INCa" and gid is not null)); - +create temp table hfri_unidentified_only as select docid, var('hfri_unidentified') as id, prev, middle, next from (setschema 'docid,prev,middle,next' + select c1 as docid, textwindow2s(filterstopwords(lower(c2)), 10,3,10, "\bhfri\b|h\.f\.r\.i\.|hellenic foundation research|greek foundation research|ελιδεκ|ελληνικο ιδρυμα ερευνας|ελληνικό ίδρυμα έρευνας|elidek") from ((setschema 'c1,c2' select * from pubs where c2 is not null))) + where var('hfri_unidentified') and lower(j2s(prev,middle,next)) not like "%himalayan%" and not regexprmatches(var('hfrineg'), lower(j2s(prev,middle,next))) and regexprmatches("gsrt|greek|greece|hellenic",lower(j2s(prev,middle,next))); + + +create temp table output_hfri as +select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', textsnippet_) as C1, docid, 
id from ( + select docid, id, textsnippet_ from + (setschema 'docid,textsnippet_,textwin1,textwin2,textwin3, proj_id, id' select docid, textsnippet_, textwindow2s(textsnippet_,2,1,1,"(?:\b|fm17)"||proj_id||"\b") as textwin, proj_id, id from ( + select docid,t2.Project_id as proj_id, t2.id, t1.textsnippet_ from + ( setschema 'docid, textsnippet_, res' select distinct docid, textsnippet_, jsplitv(regexprfindall('(?:\b|fm17)(\d{2,4})\b',textsnippet_)) as res + from ( select docid, filterstopwords(lower(keywords(j2s(prev,middle,next)))) as textsnippet_ from (setschema 'docid, id, prev, middle, next' select * from hfri_unidentified_only)) ) as t1, + (select id, grantid as project_id from grants where fundingclass1 = "HFRI") as t2 where res = t2.project_id and t2.project_id is not null + ) + ) where (regexprmatches(var('hfripos'),textwin1) or textwin2 like "%"||"fm17"||proj_id||"%" or textwin2 = "."||proj_id) and textwin1 is not null and textwin2 is not null + and textwin3 is not "hellenic" group by docid, id); + create temp table matched_undefined_miur_only as select distinct docid, var('miur_unidentified') as id, prev,middle,next from (setschema 'docid,prev,middle,next' @@ -37,6 +56,9 @@ select c1 as docid, textwindow2s(c2,10,1,10, '\b(?:RBSI\d{2}\w{4})\b') from (set where var('miur_unidentified') and (regexprmatches('\b(?:RBSI\d{2}\w{4})\b', middle)); + + + create temp table matched_undefined_inca_only as select distinct docid, var('inca_unidentified') as id, prev,middle,next from (setschema 'docid,prev,middle,next' select c1 as docid, textwindow2s(c2,15,4,10, '\bINCa|French National Cancer Institute') from (setschema 'c1,c2' select * from pubs where c2 is not null)) where var('inca_unidentified') @@ -318,6 +340,7 @@ delete from output_table where fundingClass1="CHIST-ERA" and grantid="unidentifi delete from matched_undefined_miur_only where docid in (select docid from output_table where fundingClass1="MIUR"); delete from matched_undefined_wt_only where docid in 
(select docid from output_table where fundingClass1="WT"); delete from matched_undefined_gsri where docid in (select docid from output_table where fundingClass1="GSRI"); +delete from hfri_unidentified_only where docid in (select docid from output_hfri); delete from output_table where j2s(docid,id) in (select j2s(T.docid, T.id) from output_table S, output_table T where S.docid = T.docid and S.id in (select id from grants where grantid in (select * from gold)) and T.id in (select id from grants where grantid in ("246686", "283595","643410"))); delete from output_table where fundingclass1 = "EC" and j2s(docid, grantid) in (select j2s(docid, grantid) from output_table where fundingclass1 = "RCN"); @@ -339,10 +362,14 @@ select C1 from output_table union all select C1 from secondary_output_table union all +select C1 from output_hfri +union all select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', prev||" "||middle||" "||next) from matched_undefined_miur_only union all select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', prev||" "||middle||" "||next) from matched_undefined_wt_only union all select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', prev||" "||middle||" "||next) from matched_undefined_inca_only union all -select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', prev||" "||middle||" "||next) from matched_undefined_gsri; +select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', prev||" "||middle||" "||next) from matched_undefined_gsri +union all +select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', prev||" << "||middle||" >> "||next) from (select * from hfri_unidentified_only group by docid); \ No newline at end of file diff --git a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_text.json 
b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_text.json index d2a9a148f..eb77c950c 100644 --- a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_text.json +++ b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_text.json @@ -56,3 +56,4 @@ {"text":"Funding sources/sponsors: JL is funded by a National Health and Medical Research Council of Australia Partnership Project Grant (1056888).", "id":"50|sharebioRxiv::20773c458e7814d5b47c6c498b775d2b"} {"text":"This material is based upon work supported by the National Science Foundation under Grant No. ATM-0513463.", "id":"50|od_______212::0f31511cdbd148bf5446b52f49ba8544"} {"text":"Acknowledgements This work was partially supported by Science Foundation Ireland Grant 04/IN1/I478 and Science Foundation Ireland Grant 03/RPT1/I382.", "id":"50|doi_________::9c4ddd5d830294ab76d7e7919a379f3b"} +{"text":"Acknowledgments: This work was supported by the Hellenic Foundation for Research and Innovation (HFRI - Project No: 789)", "id":"50|arXiv_______::54002047659adf031293eabfbfe9938b"} diff --git a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_to_project.json b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_to_project.json index 8acc6842e..206bfaf25 100644 --- a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_to_project.json +++ b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_to_project.json @@ -448,4 +448,10 @@ "projectId": "40|sfi_________::05395fd69f5aa3ba5e9e75dcf527d8ac", "confidenceLevel": 0.8, "textsnippet": "Acknowledgements This work was partially supported by Science 
Foundation Ireland Grant <<< 04/IN1/I478 >>> and Science Foundation Ireland Grant" +} +{ + "documentId": "50|arXiv_______::54002047659adf031293eabfbfe9938b", + "projectId": "40|hfri________::644d89adeca811786cf72d7967ec9813", + "confidenceLevel": 0.8, + "textsnippet": "acknowledgments work supported hellenic foundation research innovation hfri project 789" } \ No newline at end of file diff --git a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/project.json b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/project.json index 0996fe427..17f8a0905 100644 --- a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/project.json +++ b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/project.json @@ -72,3 +72,5 @@ {"id": "40|irb_hr______::0a480b267a9c5f652a8cc607bda9fe1c", "projectGrantId": "053-0532265-2255", "projectAcronym": null, "fundingClass": "MZOS::", "jsonextrainfo": "{}"} {"id": "40|irb_hr______::37ca9ece55928656726557c7c0a36a1a", "projectGrantId": "IP-2013-11-1021", "projectAcronym": null, "fundingClass": "HRZZ::", "jsonextrainfo": "{}"} {"id": "40|nhmrc_______::019492919738381cbee98a17ae1dae25", "projectGrantId": "1056888", "projectAcronym": null, "fundingClass": "NHMRC::NHMRC Partnerships", "jsonextrainfo": "{}"} +{"id": "40|hfri________::644d89adeca811786cf72d7967ec9813", "projectGrantId": "789", "projectAcronym": null, "fundingClass": "HFRI::", "jsonextrainfo": "{}"} +{"id": "40|hfri________::cb5d92ce46b051859d1d9655e0ae7b46", "projectGrantId": "unidentified", "projectAcronym": null, "fundingClass": "HFRI::", "jsonextrainfo": "{}"} \ No newline at end of file diff --git a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/report_funder.json 
b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/report_funder.json index eebd358b0..c2fa14d6b 100644 --- a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/report_funder.json +++ b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/report_funder.json @@ -53,6 +53,11 @@ "type": "COUNTER", "value": "2" } +{ + "key": "processing.referenceExtraction.project.references.byfunder.hfri", + "type": "COUNTER", + "value": "1" +} { "key": "processing.referenceExtraction.project.references.byfunder.hrzz", "type": "COUNTER", @@ -171,5 +176,5 @@ { "key": "processing.referenceExtraction.project.references.total", "type": "COUNTER", - "value": "75" + "value": "76" } \ No newline at end of file