-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1293 from PathwayCommons/iss1148_testing-hints-mapper
Testing organism hints mapper for BioC documents.
- Loading branch information
Showing
10 changed files
with
3,390 additions
and
3,281 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,85 +1,84 @@ | ||
import _ from 'lodash'; | ||
|
||
// Hint types: the kinds of bioentity a hint can describe (currently organisms only)
const HINT_TYPE = Object.freeze({
  ORGANISM: 'organism',
});

// All valid hint type values, used for membership checks in the type setter
const HINT_TYPES = Object.values(HINT_TYPE);

// Sections of a document that a hint can be drawn from
const SECTION = Object.freeze({
  TITLE: 'title',
  ABSTRACT: 'abstract',
});

// All valid section values, used for membership checks in the section setter
const SECTIONS = Object.values(SECTION);

/**
 * Representing a bioentity mention and ground
 */
class Hint {
  /**
   * Creates an instance of Hint.
   * Every argument is passed through the matching setter, so an invalid
   * value is rejected at construction time rather than stored silently.
   * @param {Array} texts - The texts associated with the hint; must be a non-empty array.
   * @param {string} type - The type of the hint; one of the HINT_TYPE values.
   * @param {Object} xref - The cross-reference (xref) object; must have `dbPrefix` and `id`.
   * @param {string} section - The section of the document where the hint was found; one of the SECTION values.
   * @throws {TypeError} If any argument fails its setter's validation.
   */
  constructor(texts, type, xref, section) {
    // Assign through the setters (not the backing fields) to enforce validation
    this.texts = texts;
    this.type = type;
    this.xref = xref;
    this.section = section;
  }

  // Getter and setter for texts
  get texts() {
    return this._texts;
  }

  set texts(val) {
    // Validate that texts is a non-empty array
    if (!Array.isArray(val) || val.length === 0) {
      throw new TypeError('Invalid texts');
    }
    this._texts = val;
  }

  // Getter and setter for type
  get type() {
    return this._type;
  }

  set type(val) {
    // Validate that type is one of the predefined HINT_TYPES
    if (!HINT_TYPES.includes(val)) throw new TypeError('Invalid type');
    this._type = val;
  }

  // Getter and setter for xref
  get xref() {
    return this._xref;
  }

  set xref(val) {
    // Validate that xref exists and has dbPrefix and id properties.
    // The nil check keeps the error message consistent instead of failing
    // on the property access itself.
    if (!val || !val.dbPrefix || !val.id) throw new TypeError('Invalid xref');
    this._xref = val;
  }

  // Getter and setter for section
  get section() {
    return this._section;
  }

  set section(val) {
    // Validate that section is one of the predefined SECTIONS
    if (!SECTIONS.includes(val)) throw new TypeError('Invalid section');
    this._section = val;
  }
}

export { Hint, HINT_TYPE, SECTION };
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
import _ from 'lodash'; | ||
import { Hint, HINT_TYPE } from '../../../../../model/hint.js'; | ||
import { COLLECTIONS } from '../../../../../util/registry.js'; | ||
|
||
// PubTator annotation types that are in scope for hints.
// See Table 1 https://www.ncbi.nlm.nih.gov/research/pubtator3/tutorial
const PUBTATOR_ANNOTATION_TYPE = Object.freeze({
  SPECIES: 'Species',
  // could add more types here when scope expands
});

// PubTator database names that are in scope for hints.
const PUBTATOR_DATABASE = Object.freeze({
  ncbi_taxonomy: 'ncbi_taxonomy',
  // could add more databases here when scope expands
});

// PubTator annotation type -> hint type
const entityTypes = new Map([
  [PUBTATOR_ANNOTATION_TYPE.SPECIES, HINT_TYPE.ORGANISM],
]);

// PubTator database name -> collection record merged into the hint xref
const database2Xref = new Map([
  [PUBTATOR_DATABASE.ncbi_taxonomy, COLLECTIONS.NCBI_TAXONOMY],
]);

// Identifier values that stand for "no identifier"
const EMPTY_SYMBOLS = new Set(['-', '']);

/**
 * Checks whether an annotation's `infons.type` is a type that can be mapped to a hint type.
 *
 * @param {Object} annotation - An annotation object containing an `infons` object with metadata.
 * @returns {boolean} - `true` if `infons.type` has an entry in `entityTypes`, otherwise `false`
 * (including when `infons` is missing).
 */
const isValidType = (annotation) =>
  entityTypes.has(_.get(annotation, ['infons', 'type']));

/**
 * Checks whether the xref (cross-reference) of an annotation can be turned into a hint xref.
 *
 * Validation Checks:
 *  - `infons.identifier` must be a string (a missing, null or non-string value is rejected).
 *  - The identifier must not be an empty string or a dash ('-').
 *  - The identifier must hold exactly one id once split on ';' (semi-colon delimited lists and
 *    strings made only of semi-colons are rejected).
 *  - `infons.database` must have an entry in `database2Xref`; otherwise the resulting
 *    xref would lack its database fields.
 *
 * @param {Object} annotation - An annotation object containing an `infons` object with metadata.
 * @returns {boolean} - `true` if all validation checks pass, otherwise `false`.
 */
const isValidXref = (annotation) => {
  const id = _.get(annotation, ['infons', 'identifier']);
  if (typeof id !== 'string' || EMPTY_SYMBOLS.has(id)) return false;

  // Empty segments (e.g. from a trailing ';') do not count as ids
  const ids = _.compact(id.split(';'));
  if (ids.length !== 1) return false;

  return database2Xref.has(_.get(annotation, ['infons', 'database']));
};

/**
 * Groups annotations by their database and identifier (xref) fields.
 *
 * Annotations sharing the same `infons.database` and `infons.identifier` are merged into a
 * single object holding the `infons` of the first annotation in the group and the `text`
 * of every annotation in the group.
 *
 * @param {Array} annotations - Annotation objects, each with an `infons` object and a `text` field.
 * @returns {Array} - One object per group, containing:
 *  - `infons`: The metadata from the first annotation in the group.
 *  - `texts`: An array of `text` fields from all annotations in the group.
 */
const groupByXref = (annotations) => {
  const byXref = ({ infons }) => `${infons.database}_${infons.identifier}`;
  const groups = Object.values(_.groupBy(annotations, byXref));
  return groups.map((group) => ({
    infons: _.first(group).infons,
    texts: group.map((a) => a.text),
  }));
};

/**
 * Converts a grouped annotation into a Hint object.
 *
 * The xref is built from the annotation identifier plus the collection record registered for
 * its database; the hint type is looked up from the annotation type.
 *
 * @param {Object} annotation - A grouped annotation containing `infons` and `texts` fields.
 * @param {string} section - The section of the document where the annotation was found. e.g., 'title' or 'abstract'.
 * @returns {Hint} - A new Hint constructed from the annotation data and section.
 */
const toHint = (annotation, section) => {
  const {
    texts,
    infons: { identifier: id, database, type },
  } = annotation;
  // Copy into a fresh object so the shared collection record is never modified
  const xref = _.assign({ id }, database2Xref.get(database));
  const eType = entityTypes.get(type);
  return new Hint(texts, eType, xref, section);
};

/**
 * Map a PubTator BioCDocument to a hint
 *
 * For each passage, annotations are filtered down to those with a supported type and a usable
 * xref, grouped by xref, and each group is converted to one Hint tagged with the passage's
 * `infons.type` as its section.
 *
 * @param {object} bioCDocument as defined by [NLM DTD]{@link ftp://ftp.ncbi.nlm.nih.gov/pub/wilbur/BioC-PMC/BioC.dtd}
 * @returns {Array.<Hint>} hints a set of hints; empty when the document has no passages
 */
function map(bioCDocument) {
  const hints = [];
  // A document without passages yields no hints instead of throwing
  const passages = _.get(bioCDocument, 'passages') || [];

  for (const passage of passages) {
    const section = _.get(passage, ['infons', 'type']);
    const candidates = _.filter(
      _.get(passage, 'annotations'),
      (a) => isValidType(a) && isValidXref(a)
    );

    for (const group of groupByXref(candidates)) {
      hints.push(toHint(group, section));
    }
  }
  return hints;
}
|
||
export default map; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
/**
 * Registry of external collections (databases) that identifiers can belong to.
 * Each entry carries a display name (`dbname`) and an identifier prefix (`dbPrefix`).
 * `Object.freeze` is shallow, so every entry is frozen individually as well;
 * otherwise the shared records could still be modified by any importer.
 */
const COLLECTIONS = Object.freeze({
  PUBMED: Object.freeze({
    dbname: 'PubMed',
    dbPrefix: 'pubmed',
  }),
  /**
   * The Taxonomy Database is a curated classification and nomenclature for all of the organisms
   * in the public sequence databases. This currently represents about 10% of the described species of life on the planet.
   */
  NCBI_TAXONOMY: Object.freeze({
    dbname: 'NCBI Taxonomy',
    dbPrefix: 'NCBITaxon',
  }),
});

export { COLLECTIONS };
|
Oops, something went wrong.