Skip to content

Commit

Permalink
Merge pull request #1293 from PathwayCommons/iss1148_testing-hints-ma…
Browse files Browse the repository at this point in the history
…pper

Testing organism hints mapper for BioC documents.
  • Loading branch information
jvwong authored Aug 8, 2024
2 parents e9e05e4 + d0c02d4 commit dbd1ca0
Show file tree
Hide file tree
Showing 10 changed files with 3,390 additions and 3,281 deletions.
133 changes: 66 additions & 67 deletions src/model/hint.js
Original file line number Diff line number Diff line change
@@ -1,85 +1,84 @@
import _ from 'lodash';

// Define constants for Hint types (currently only ORGANISM)
const HINT_TYPE = Object.freeze({
ORGANISM: 'organism'
ORGANISM: 'organism',
});

// Flatten the HINT_TYPE object to create an array of all hint types
const HINT_TYPES = _.flatMap(HINT_TYPE);

const PASSAGE_TYPE = Object.freeze({
TITLE: 'title',
ABSTRACT: 'abstract'
// Define constants for sections of a document
const SECTION = Object.freeze({
TITLE: 'title',
ABSTRACT: 'abstract',
});
const PASSAGE_TYPES = _.flatMap(PASSAGE_TYPE);

/*
* Class representing a Hint.
* A hint is a piece of information that is extracted from sections of articles, such as the title or abstract.
* It can be about the organism being studied in the article or a gene name under investigation.
*/
class Hint{
// Flatten the SECTION object to create an array of all sections
const SECTIONS = _.flatMap(SECTION);

/**
* Create a Hint.
* @param {string} text - The hint text.
* @param {string} type - The hint type.
* @param {Object} xref - The hint xref.
* @param {string} section - The hint section.
*/
constructor(text, type, xref, section){
this._text = text;
this._type = type;
this._xref = xref;
this._section = section;
}
/**
* Represents a bioentity mention (its texts) and the grounding (xref) it maps to
*/
class Hint {
/**
* Creates an instance of Hint.
* @param {Array} texts - The texts associated with the hint.
* @param {string} type - The type of the hint.
* @param {Object} xref - The cross-reference (xref) object.
* @param {string} section - The section of the document where the hint was found.
*/
constructor(texts, type, xref, section) {
// Initialize the properties using the setters to enforce validation
this._texts = texts;
this._type = type;
this._xref = xref;
this._section = section;
}

/**
* Get the hint text.
* The text is a single mention extracted from the article section.
* @returns {string} The hint text.
*/
get text(){
return this._text;
}
// Getter and setter for texts
get texts() {
return this._texts;
}

set text(value){
this._text = value;
}
set texts(val) {
// Validate that texts is not empty
if (!val || _.isEmpty(val)) throw new TypeError('Invalid texts');
this._texts = val;
}

get type(){
return this._type;
}
// Getter and setter for type
get type() {
return this._type;
}

set type(value){
if( value != HINT_TYPES.ORGANISM){
throw new TypeError('Invalid type' + value);
}
this._type = value;
}
set type(val) {
// Validate that type is one of the predefined HINT_TYPES
if (!_.includes(HINT_TYPES, val)) throw new TypeError('Invalid type');
this._type = val;
}

get xref(){
return this._xref;
}
// Getter and setter for xref
get xref() {
return this._xref;
}

set xref(value){
if (!value.dbPrefix || !value.id){
throw new TypeError('Invalid xref' + JSON.stringify(value));
}
this._xref = value;
}
set xref(val) {
// Validate that xref has dbPrefix and id properties
if (!val.dbPrefix || !val.id) throw new TypeError('Invalid xref');
this._xref = val;
}

get section(){
return this._section;
}
// Getter and setter for section
get section() {
return this._section;
}

set section(value){
if( ! _.includes(PASSAGE_TYPES, value) ) {
throw new TypeError('Invalid section' + value);
}
this._section = value;
}
set section(val) {
// Validate that section is one of the predefined SECTIONS
if (!_.includes(SECTIONS, val)) throw new TypeError('Invalid section');
this._section = val;
}
}
export {
Hint,
HINT_TYPE,
PASSAGE_TYPE
};

export { Hint, HINT_TYPE, SECTION };
163 changes: 163 additions & 0 deletions src/server/routes/api/document/hint/pubtator.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import _ from 'lodash';
import { Hint, HINT_TYPE } from '../../../../../model/hint.js';
import { COLLECTIONS } from '../../../../../util/registry.js';

/**
* Map a PubTator BioCDocument to an array of hints
* @param {object} bioCDocument as defined by [NLM DTD]{@link ftp://ftp.ncbi.nlm.nih.gov/pub/wilbur/BioC-PMC/BioC.dtd}
* @returns {Array.<Hint>} hints a set of hints
*/
function map(bioCDocument) {
  // See Table 1 https://www.ncbi.nlm.nih.gov/research/pubtator3/tutorial
  const PUBTATOR_ANNOTATION_TYPE = Object.freeze({
    SPECIES: 'Species',
    // could add more types here when scope expands
  });
  const PUBTATOR_DATABASE = Object.freeze({
    ncbi_taxonomy: 'ncbi_taxonomy',
    // could add more databases here when scope expands
  });
  // PubTator annotation type -> Hint type
  const entityTypes = new Map([
    [PUBTATOR_ANNOTATION_TYPE.SPECIES, HINT_TYPE.ORGANISM],
  ]);
  // PubTator database name -> registry collection merged into the xref
  const database2Xref = new Map([
    [PUBTATOR_DATABASE.ncbi_taxonomy, COLLECTIONS.NCBI_TAXONOMY],
  ]);
  // Identifier values that are treated as "no identifier"
  const EMPTY_SYMBOLS = new Set(['-', '']);

  /**
   * Checks whether the annotation's `infons.type` is one of the values of the
   * `PUBTATOR_ANNOTATION_TYPE` object.
   *
   * @param {Object} annotation - An annotation object, expected to carry an `infons` object.
   * @returns {boolean} - `true` if the type is supported; `false` otherwise,
   * including when `infons` or `infons.type` is missing.
   */
  const isValidType = (annotation) => {
    // _.get tolerates an annotation without `infons` (destructuring would throw)
    const type = _.get(annotation, ['infons', 'type']);
    return _.includes(PUBTATOR_ANNOTATION_TYPE, type);
  };

  /**
   * Checks whether the annotation carries a usable xref (cross-reference).
   *
   * Validation checks:
   * - `infons.database` must be a key of `database2Xref`; otherwise the
   *   resulting xref would have no `dbname`/`dbPrefix`.
   * - `infons.identifier` must exist and must not be null or undefined.
   * - `infons.identifier` must not be an empty string or a dash ('-').
   * - `infons.identifier` must hold exactly one non-empty value once split on
   *   semi-colons: several values are rejected, and so is a string made up of
   *   semi-colons only.
   *
   * @param {Object} annotation - An annotation object, expected to carry an `infons` object.
   * @returns {boolean} - `true` if all checks pass, otherwise `false`.
   */
  const isValidXref = (annotation) => {
    const database = _.get(annotation, ['infons', 'database']);
    if (!database2Xref.has(database)) return false;

    const id = _.get(annotation, ['infons', 'identifier']);
    if (_.isNil(id) || EMPTY_SYMBOLS.has(id)) return false;

    // _.toString guards against a non-string identifier (e.g. a number),
    // on which calling `.split` directly would throw
    const ids = _.compact(_.toString(id).split(';'));
    return ids.length === 1;
  };

  /**
   * Groups annotations by a composite key built from `infons.database` and
   * `infons.identifier`.
   *
   * @param {Array} annotations - Annotation objects, each with an `infons` object and a `text` field.
   * @returns {Array} - One object per group, containing:
   * - `infons`: the `infons` of the first annotation in the group.
   * - `texts`: the `text` of every annotation in the group, in input order.
   */
  const groupByXref = (annotations) => {
    const byXref = ({ infons }) => `${infons.database}_${infons.identifier}`;
    const groups = _.groupBy(annotations, byXref);
    return Object.values(groups).map((group) => {
      const texts = group.map((a) => a.text);
      const { infons } = _.first(group);
      return { infons, texts };
    });
  };

  /**
   * Converts a grouped annotation into a Hint.
   *
   * The xref is the identifier merged with the registry entry mapped from the
   * annotation's database; the hint type is mapped from the annotation's type.
   *
   * @param {Object} annotation - A grouped annotation with `infons` and `texts` fields.
   * @param {string} section - The passage's `infons.type`, e.g., 'title' or 'abstract'.
   * @returns {Hint} - A new Hint built from the annotation data and section.
   */
  const toHint = (annotation, section) => {
    const {
      texts,
      infons: { identifier: id, database, type },
    } = annotation;
    // Copy into a fresh object so the shared registry entry is never mutated
    const xref = _.assign({ id }, database2Xref.get(database));
    const eType = entityTypes.get(type);
    return new Hint(texts, eType, xref, section);
  };

  // lodash collection helpers treat a missing `passages` / `annotations` as an
  // empty collection, so a document with nothing to map yields [] rather than
  // throwing.
  return _.flatMap(_.get(bioCDocument, 'passages'), (passage) => {
    const section = _.get(passage, ['infons', 'type']);
    const supported = _.filter(
      _.get(passage, 'annotations'),
      (a) => isValidType(a) && isValidXref(a)
    );
    return groupByXref(supported).map((a) => toHint(a, section));
  });
}

export default map;
17 changes: 17 additions & 0 deletions src/util/registry.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/**
 * Registry of external collections referenced by xrefs.
 * Each entry carries a display name (`dbname`) and a prefix (`dbPrefix`).
 *
 * Object.freeze is shallow, so every entry is frozen individually as well;
 * otherwise any importer could mutate these shared objects in place.
 */
const COLLECTIONS = Object.freeze({
  PUBMED: Object.freeze({
    dbname: 'PubMed',
    dbPrefix: 'pubmed',
  }),
  /**
   * The Taxonomy Database is a curated classification and nomenclature for all of the organisms
   * in the public sequence databases. This currently represents about 10% of the described species of life on the planet.
   */
  NCBI_TAXONOMY: Object.freeze({
    dbname: 'NCBI Taxonomy',
    dbPrefix: 'NCBITaxon',
  }),
});

export { COLLECTIONS };

Loading

0 comments on commit dbd1ca0

Please sign in to comment.