Skip to content

Commit

Permalink
refactor code, tests pass
Browse files Browse the repository at this point in the history
ready to add gaps, tests pass

final refactor
  • Loading branch information
manulera committed May 29, 2024
1 parent d1e8217 commit dcdaf44
Showing 1 changed file with 190 additions and 116 deletions.
306 changes: 190 additions & 116 deletions packages/sequence-utils/src/getAminoAcidDataForEachBaseOfDna.js
Original file line number Diff line number Diff line change
@@ -1,24 +1,60 @@
import { translateRange, getSequenceWithinRange } from "@teselagen/range-utils";
import {
translateRange,
getSequenceWithinRange,
flipContainedRange
} from "@teselagen/range-utils";
import revComp from "./getReverseComplementSequenceString";
import getAA from "./getAminoAcidFromSequenceTriplet";

//
import proteinAlphabet from "./proteinAlphabet";

// ac.throw([ac.string,ac.bool],arguments);
/**
* @private
* Gets aminoAcid data, including position in string and position in codon
* from the sequenceString and the direction of the translation
* Gets the next triplet of bases in the sequenceString
* @param {Number} index The index of the sequenceString to start at
* @param {String} sequenceString The dna sequenceString.
* @param {boolean} forward Should we find forward facing orfs or reverse facing orfs
* @param {boolean} isProteinSequence We're passing in a sequence of AA chars instead of DNA chars (slightly confusing but we'll still use the dna indexing for rendering in OVE)
* @return [{
aminoAcid:
positionInCodon:
}]
* @return {Object} The triplet of bases, the number of bases read, and the positions of the codon bases in the sequenceString
* @property {String} triplet The triplet of bases
* @property {Number} basesRead The number of bases read
* @property {Number[]} codonPositions The positions of the codon bases in the sequenceString
*/
export default function getAminoAcidDataForEachBaseOfDna(
function getNextTriplet(index, sequenceString) {
let triplet = "";
let internalIndex = index;
// Positions of codons relative to the coding sequence
// including introns.
const codonPositions = [];
while (triplet.length < 3) {
if (sequenceString[internalIndex]) {
triplet += sequenceString[internalIndex];
codonPositions.push(internalIndex);
internalIndex++;
} else {
break;
}
}
return { triplet, basesRead: internalIndex - index, codonPositions };
}

/**
* @private
* Returns a series of derived properties from the arguments to getAminoAcidDataForEachBaseOfDna
* @param {String} originalSequenceString The dna sequenceString.
* @param {boolean} forward Whether the translation is in the forward direction.
* @param {Object} optionalSubrangeRange The range of the sequenceString to translate.
* @param {boolean} isProteinSequence Whether the sequenceString is a protein sequence.
* @return {Object} The derived properties
* @property {String} sequenceString
* - If !isProtein: The subsequence within originalSequenceString that will be translated, defined by transaltionRange. If
* !forward, this will be the reverse complement of the subsequence.
* - If isProtein: The originalSequenceString.
* @property {Object} translationRange The range of the originalSequenceString that we're translating (if !isProtein), or getting DNA-level
info for (if isProtein).
* @property {Number} originalSequenceStringLength The length of the full DNA sequence. If !isProtein it's the length of originalSequenceString
* @property {Number} sequenceStringLength The length of the DNA sequence that would give the translation.
*/
function getTranslatedSequenceProperties(
originalSequenceString,
forward,
optionalSubrangeRange,
Expand All @@ -27,137 +63,175 @@ export default function getAminoAcidDataForEachBaseOfDna(
const originalSequenceStringLength = isProteinSequence
? originalSequenceString.length * 3
: originalSequenceString.length;

let sequenceString = originalSequenceString;
let startOffset = 0;
const translationRange = { start: 0, end: originalSequenceStringLength - 1 };

if (optionalSubrangeRange) {
sequenceString = getSequenceWithinRange(
optionalSubrangeRange,
originalSequenceString
);
startOffset = optionalSubrangeRange.start;
translationRange.start = optionalSubrangeRange.start;
translationRange.end = optionalSubrangeRange.end;
}

const sequenceStringLength = isProteinSequence
? sequenceString.length * 3
: sequenceString.length;

// ac.throw([ac.string,ac.bool],arguments);
const aminoAcidDataForEachBaseOfDNA = [];
let codonRange;
let revCompGapLength = 0;
let aminoAcidIndex = 0;
if (!isProteinSequence && !forward) {
sequenceString = revComp(sequenceString);
}

// If a CDS has introns, make a list with the positions of the intron bases
if (
!isProteinSequence &&
optionalSubrangeRange &&
optionalSubrangeRange.positions
) {
// TODO: Implement intronBases
// const intronBases = [];
}

return {
sequenceString,
translationRange,
sequenceStringLength,
originalSequenceStringLength
};
}

/**
* Function to convert the position within the CDS (where A in ATG is 0, and T in ATG is 1)
* to the position in the main sequence
*
* @param {Number} index The index of the sequenceString to start at
* @param {boolean} forward Whether the translation is in the forward direction.
* @param {Object} translationRange The range of the originalSequenceString that we're translating (if !isProtein), or getting DNA-level
* info for (if isProtein).
* @param {Number} mainSequenceLength The length of the full DNA sequence. If !isProtein it's the length of originalSequenceString
* @return {Number} The position in the main sequence
*
*/
function positionInCdsToPositionInMainSequence(
index,
forward,
translationRange,
mainSequenceLength
) {
let outputRange = translateRange(
{ start: index, end: index },
translationRange.start,
mainSequenceLength
);
if (!forward) {
//compute the start of the amino acid sequence, but only if translating in the reverse direction
aminoAcidIndex = Math.floor((sequenceStringLength - 1) / 3);
//because we're translating in the reverse direction, we need to
//check to see if there are untranslated amino acids at the start of the sequenceString
revCompGapLength = sequenceStringLength % 3;
codonRange = translateRange(
{
start: 0,
end: revCompGapLength - 1
},
startOffset,
originalSequenceStringLength
outputRange = flipContainedRange(
outputRange,
translationRange,
mainSequenceLength
);

if (revCompGapLength > 0) {
for (let i = 0; i < revCompGapLength; i++) {
aminoAcidDataForEachBaseOfDNA.push({
aminoAcid: getAA("xxx"), //fake xxx triplet returns the ambiguous X amino acid
positionInCodon: revCompGapLength - i - 1,
aminoAcidIndex,
sequenceIndex: codonRange.start + i,
codonRange,
fullCodon: false
});
}
aminoAcidIndex--;
}
}
return outputRange.start;
}

//compute the bulk of the sequence
for (
let index = 2 + revCompGapLength;
index < sequenceStringLength;
index += 3
) {
/**
* @private
* Gets aminoAcid data, including position in string and position in codon
* from the sequenceString and the direction of the translation
* @param {String} sequenceString The dna sequenceString.
* @param {boolean} forward Should we find forward facing orfs or reverse facing orfs
* @param {boolean} isProteinSequence We're passing in a sequence of AA chars instead of DNA chars (slightly confusing but we'll still use the dna indexing for rendering in OVE)
* @return [{
aminoAcid:
positionInCodon:
}]
*/
export default function getAminoAcidDataForEachBaseOfDna(
originalSequenceString,
forward,
optionalSubrangeRange,
isProteinSequence
) {
// Obtain derived properties, see getTranslatedSequenceProperties
const {
sequenceString,
translationRange,
sequenceStringLength,
originalSequenceStringLength
} = getTranslatedSequenceProperties(
originalSequenceString,
forward,
optionalSubrangeRange,
isProteinSequence
);

const aminoAcidDataForEachBaseOfDNA = [];
// Iterate over the DNA sequence length in increments of 3
for (let index = 0; index < sequenceStringLength; index += 3) {
let aminoAcid;
const aminoAcidIndex = index / 3;
let codonPositionsInCDS;

if (isProteinSequence) {
aminoAcid =
proteinAlphabet[sequenceString[(index - 2) / 3].toUpperCase()];
codonPositionsInCDS = [0, 1, 2].map(i => index + i);
aminoAcid = proteinAlphabet[sequenceString[index / 3].toUpperCase()];
} else {
let triplet = sequenceString.slice(index - 2, index + 1);
if (!forward) {
//we reverse the triplet
triplet = revComp(triplet);
}
aminoAcid = getAA(triplet);
// Get the triplet of DNA bases
const {
triplet,
basesRead,
codonPositions: _codonPositions
} = getNextTriplet(index, sequenceString);
codonPositionsInCDS = _codonPositions;

// If the triplet is not full, we need to add the gap xxx amino acid,
aminoAcid = triplet.length === 3 ? getAA(triplet) : getAA("xxx");
index += basesRead - triplet.length;
// TODO - add intron bases
// if (index === 3) {
// aminoAcidDataForEachBaseOfDNA.push({});
// }
}
codonRange = translateRange(
{
start: index - 2,
end: index
},
startOffset,
originalSequenceStringLength

const absoluteCodonPositions = codonPositionsInCDS.map(i =>
positionInCdsToPositionInMainSequence(
i,
forward,
translationRange,
originalSequenceStringLength
)
);

aminoAcidDataForEachBaseOfDNA.push({
aminoAcid, //gap amino acid
positionInCodon: forward ? 0 : 2,
aminoAcidIndex,
sequenceIndex: codonRange.start,
codonRange,
fullCodon: true
});
aminoAcidDataForEachBaseOfDNA.push({
aminoAcid, //gap amino acid
positionInCodon: 1,
aminoAcidIndex,
sequenceIndex: codonRange.start + 1,
codonRange,
fullCodon: true
});
aminoAcidDataForEachBaseOfDNA.push({
aminoAcid, //gap amino acid
positionInCodon: forward ? 2 : 0,
aminoAcidIndex,
sequenceIndex: codonRange.start + 2,
codonRange,
fullCodon: true
});
if (forward) {
aminoAcidIndex++;
} else {
aminoAcidIndex--;
}
}
const codonRange = forward
? {
start: absoluteCodonPositions[0],
end: absoluteCodonPositions[codonPositionsInCDS.length - 1]
}
: {
start: absoluteCodonPositions[codonPositionsInCDS.length - 1],
end: absoluteCodonPositions[0]
};

//compute the end of the sequence
//we'll never hit the following logic if translating in the reverse direction
const lengthOfEndBpsNotCoveredByAminoAcids =
sequenceStringLength - aminoAcidDataForEachBaseOfDNA.length;
codonRange = translateRange(
{
start: sequenceStringLength - lengthOfEndBpsNotCoveredByAminoAcids,
end: sequenceStringLength - 1
},
startOffset,
originalSequenceStringLength
);
for (let j = 0; j < lengthOfEndBpsNotCoveredByAminoAcids; j++) {
aminoAcidDataForEachBaseOfDNA.push({
aminoAcid: getAA("xxx"), //fake xxx triplet returns the gap amino acid
positionInCodon: j,
aminoAcidIndex,
sequenceIndex: codonRange.start + j,
fullCodon: false,
codonRange
});
for (let i = 0; i < codonPositionsInCDS.length; i++) {
aminoAcidDataForEachBaseOfDNA.push({
aminoAcid,
positionInCodon: i,
aminoAcidIndex,
sequenceIndex: absoluteCodonPositions[i],
codonRange,
fullCodon: codonPositionsInCDS.length === 3
});
}
}

if (sequenceStringLength !== aminoAcidDataForEachBaseOfDNA.length) {
throw new Error("something went wrong!");
}

// Reverse the array if we're translating in the reverse direction
if (!forward) {
aminoAcidDataForEachBaseOfDNA.reverse();
}
return aminoAcidDataForEachBaseOfDNA;
}

0 comments on commit dcdaf44

Please sign in to comment.