diff --git a/external/dataIntegration/mCADRE/pruningModel.m b/external/dataIntegration/mCADRE/pruningModel.m index 5c4b7d76ec..33ad4e53de 100644 --- a/external/dataIntegration/mCADRE/pruningModel.m +++ b/external/dataIntegration/mCADRE/pruningModel.m @@ -66,7 +66,17 @@ model_rem = removeRxns(tissueModel, r); end % Check for inactive reactions after removal of r - [fluxConsistentMetBool,fluxConsistentRxnBool] = findFluxConsistentSubset(model_rem,paramConsistency); + try + [fluxConsistentMetBool,fluxConsistentRxnBool] = findFluxConsistentSubset(model_rem,paramConsistency); + rStatus_and_not_error = true; + catch + rStatus_and_not_error = false; + end + else + rStatus_and_not_error = false; + end + + if rStatus_and_not_error inactive_G= [ r; model_rem.rxns(fluxConsistentRxnBool==0)]; inactiveCore = intersect(inactive_G, coreRxn); diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/additionalAnalysis/analyzeMgPipeResults.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/additionalAnalysis/analyzeMgPipeResults.m index 0eaa701a12..8def2a7e93 100644 --- a/src/analysis/multiSpecies/microbiomeModelingToolbox/additionalAnalysis/analyzeMgPipeResults.m +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/additionalAnalysis/analyzeMgPipeResults.m @@ -113,7 +113,7 @@ function analyzeMgPipeResults(infoFilePath,resPath,varargin) cd(violinPath) % create violin plots for net uptake and secretion files - if any(contains(fileList{i,1},{'net_uptake_fluxes.csv','net_secretion_fluxes.csv'})) + if any(strcmp(fileList{i,1},{'net_uptake_fluxes.csv','net_secretion_fluxes.csv'})) makeViolinPlots(sampleData, infoFile, 'stratification',sampleGroupHeaders{j}, 'plottedFeature', filename, 'unit', 'mmol/person/day') end cd(currentDir) diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/additionalAnalysis/calculateReactionAbundance.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/additionalAnalysis/calculateReactionAbundance.m old mode 100755 new mode 100644 index 177534c0ee..e4a885e71a --- a/src/analysis/multiSpecies/microbiomeModelingToolbox/additionalAnalysis/calculateReactionAbundance.m +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/additionalAnalysis/calculateReactionAbundance.m @@ -1,438 +1,357 @@ -function [ReactionAbundance,TaxonomyInfo] = calculateReactionAbundance(abundancePath, modelPath, infoFilePath, rxnsList, numWorkers) -% Part of the Microbiome Modeling Toolbox. This function calculates and -% plots the total abundance of reactions of interest in a given microbiome -% sample based on the strain-level composition. -% Reaction presence or absence in each strain is derived from the reaction content -% of the respective AGORA model. Two results are given: the total abundance, -% and the abundance on different taxonomical levels. -% -% USAGE -% -% [ReactionAbundance,TaxonomyInfo] = calculateReactionAbundance(abundancePath, modelPath, infoFilePath, rxnsList, numWorkers) -% -% INPUTS: -% abundancePath: Path to the .csv file with the abundance data. -% Example: 'cobratoolbox/papers/018_microbiomeModelingToolbox/examples/normCoverage.csv' -% modelPath: Folder containing the strain-specific AGORA models -% OPTIONAL INPUTS: -% infoFilePath: Path to the spreadsheet with the taxonomy -% information on organisms (default: -% AGORA_infoFile.xlsx) -% rxnsList: List of reactions for which the abundance -% should be calculated (if left empty: all -% reactions in all models) -% numWorkers: Number of workers used for parallel pool. If -% left empty, the parallel pool will not be -% started. Parallellization is recommended if -% all reactions are computed. -% -% OUTPUT: -% ReactionAbundance Structure with abundance for each microbiome -% and reaction in total and on taxon levels -% TaxonomyInfo: Taxonomical information on each taxon level -% -% .. Author: - Almut Heinken, 03/2018 -% 10/2018: changed input to location of the csv file with the -% abundance data -% 01/2020: adapted to be suitable for pan-models - -% read the csv file with the abundance data -abundance = readtable(abundancePath, 'ReadVariableNames', false); -abundance = table2cell(abundance); -if isnumeric(abundance{2, 1}) - abundance(:, 1) = []; -end - -% - -% load the models -for i = 2:size(abundance, 1) - model = readCbModel([modelPath filesep abundance{i, 1} '.mat']); - modelsList{i, 1} = model; -end - -if ~exist('rxnsList', 'var') || isempty(rxnsList) % define reaction list if not entered - fprintf('No reaction list entered. Abundances will be calculated for all reactions in all models. \n') - % get model list from abundance input file - for i = 2:size(abundance, 1) - model = modelsList{i, 1}; - rxnsList = vertcat(model.rxns, rxnsList); - end - rxnsList = unique(rxnsList); -end - -% Get the taxonomy information -if exist('infoFilePath','var') - taxonomy = readtable(infoFilePath, 'ReadVariableNames', false); - taxonomy = table2cell(taxonomy); -else - taxonomy = readtable('AGORA_infoFile.xlsx', 'ReadVariableNames', false); - taxonomy = table2cell(taxonomy); -end - -% load the models found in the individuals and extract which reactions are -% in which model -for i = 2:size(abundance, 1) - model = modelsList{i, 1}; - ReactionPresence{i, 1} = abundance{i, 1}; - for j = 1:length(rxnsList) - ReactionPresence{1, j + 1} = rxnsList{j}; - if ~isempty(find(ismember(model.rxns, rxnsList{j}))) - ReactionPresence{i, j + 1} = 1; - else - ReactionPresence{i, j + 1} = 0; - end - end -end - -% put together a Matlab structure of the results -ReactionAbundance = struct; - -% prepare table for the total abundance -for j = 1:length(rxnsList) - ReactionAbundance.('Total'){1, j + 1} = rxnsList{j}; -end - -TaxonomyLevels = { - 'Phylum' - 'Class' - 'Order' - 'Family' - 'Genus' - 'Species' -}; -% extract the list of entries on each taxonomical level -for t = 1:size(TaxonomyLevels, 1) - % find the columns corresponding to each taxonomy level and the list of - % unique taxa - taxonCol = find(strcmp(taxonomy(1, :), TaxonomyLevels{t})); - % find and save all entries - taxa = unique(taxonomy(2:end, taxonCol)); - % exclude unclassified entries - taxa(strncmp('unclassified', taxa, taxonCol)) = []; - TaxonomyLevels{t, 2} = taxa; - % define the correct columns in taxonomy table - TaxonomyLevels{t, 3} = taxonCol; - % prepare table for the abundance on taxon levels - cnt = 2; - for j = 1:length(rxnsList) - for l = 1:length(TaxonomyLevels{t, 2}) - ReactionAbundance.(TaxonomyLevels{t, 1}){1, cnt} = strcat(TaxonomyLevels{t, 2}{l}, '_', rxnsList{j}); - cnt = cnt + 1; - end - end -end - -% Find the right column for the input data (strains, species,..) -taxa=taxonomy(2:end,1); -if length(intersect(abundance(2:end,1),taxa))==size(abundance,1)-1 - inputTaxa=taxa; - inputCol=1; -else - abundance(:,1)=regexprep(abundance(:,1),'pan','','once'); - inputTaxa={}; - for i=2:size(taxonomy,2) - taxa=strrep(taxonomy(:,i),' ','_'); - taxa=strrep(taxa,'.','_'); - taxa=strrep(taxa,'/','_'); - taxa=strrep(taxa,'-','_'); - taxa=strrep(taxa,'__','_'); - if length(intersect(abundance(2:end,1),taxa))==size(abundance,1)-1 - inputTaxa=taxa; - inputCol=i; - end - end -end -if isempty(inputTaxa) - error('Some taxa in the abundance file are not found in the taxonomy file!') -end - -for i = 2:size(abundance, 2) - %% calculate reaction abundance for the samples one by one - fprintf(['Calculating reaction abundance for sample ', num2str(i - 1), ' of ' num2str(size(abundance, 2) - 1) '.. \n']) - ReactionAbundance.('Total'){i, 1} = abundance{1, i}; - if ~isempty(taxonomy) - for t = 1:size(TaxonomyLevels, 1) - ReactionAbundance.(TaxonomyLevels{t, 1}){i, 1} = abundance{1, i}; - end - end - % use parallel pool if workers specified as input - if exist('numWorkers', 'var') && numWorkers > 0 - poolobj = gcp('nocreate'); - if isempty(poolobj) - parpool(numWorkers) - end - % create tables in which abundances for each individual for - % all reactions/taxa are stored - totalAbun = zeros(length(rxnsList), 1); - if ~isempty(taxonomy) - phylumAbun = zeros(length(rxnsList), length(TaxonomyLevels{1, 2})); - classAbun = zeros(length(rxnsList), length(TaxonomyLevels{2, 2})); - orderAbun = zeros(length(rxnsList), length(TaxonomyLevels{3, 2})); - familyAbun = zeros(length(rxnsList), length(TaxonomyLevels{4, 2})); - genusAbun = zeros(length(rxnsList), length(TaxonomyLevels{5, 2})); - speciesAbun = zeros(length(rxnsList), length(TaxonomyLevels{6, 2})); - end - parfor j = 1:length(rxnsList) - % store the abundance for each reaction and taxon separately in a - % temporary file to enable parallellization - if ~isempty(taxonomy) - tmpPhyl = zeros(length(rxnsList), length(TaxonomyLevels{1, 2})); - tmpClass = zeros(length(rxnsList), length(TaxonomyLevels{2, 2})); - tmpOrder = zeros(length(rxnsList), length(TaxonomyLevels{3, 2})); - tmpFamily = zeros(length(rxnsList), length(TaxonomyLevels{4, 2})); - tmpGenus = zeros(length(rxnsList), length(TaxonomyLevels{5, 2})); - tmpSpecies = zeros(length(rxnsList), length(TaxonomyLevels{6, 2})); - end - for k = 2:size(abundance, 1) - % check if the reaction is present in the strain - if ReactionPresence{k, j + 1} == 1 - % calculate total abundance - totalAbun(j) = totalAbun(j) + str2double(abundance{k, i}); - if ~isempty(taxonomy) - % calculate phylum abundance - t = 1; - findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); - if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) - taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); - tmpPhyl(1, taxonCol) = tmpPhyl(1, taxonCol) + str2double(abundance{k, i}); - end - % calculate class abundance - t = 2; - findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); - if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) - taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); - tmpClass(1, taxonCol) = tmpClass(1, taxonCol) + str2double(abundance{k, i}); - end - % calculate order abundance - t = 3; - findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); - if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) - taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); - tmpOrder(1, taxonCol) = tmpOrder(1, taxonCol) + str2double(abundance{k, i}); - end - % calculate family abundance - t = 4; - findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); - if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) - taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); - tmpFamily(1, taxonCol) = tmpFamily(1, taxonCol) + str2double(abundance{k, i}); - end - % calculate genus abundance - t = 5; - findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); - if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) - taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); - tmpGenus(1, taxonCol) = tmpGenus(1, taxonCol) + str2double(abundance{k, i}); - end - % calculate species abundance - t = 6; - findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); - if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) - taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); - tmpSpecies(1, taxonCol) = tmpSpecies(1, taxonCol) + str2double(abundance{k, i}); - end - end - end - end - if ~isempty(taxonomy) - phylumAbun(j, :) = tmpPhyl(1, :); - classAbun(j, :) = tmpClass(1, :); - orderAbun(j, :) = tmpOrder(1, :); - familyAbun(j, :) = tmpFamily(1, :); - genusAbun(j, :) = tmpGenus(1, :); - speciesAbun(j, :) = tmpSpecies(1, :); - end - end - else - % create tables in which abundances for each individual for - % all reactions/taxa are stored - % no parallellization-takes longer - totalAbun = zeros(length(rxnsList), 1); - if ~isempty(taxonomy) - phylumAbun = zeros(length(rxnsList), length(TaxonomyLevels{1, 2})); - classAbun = zeros(length(rxnsList), length(TaxonomyLevels{2, 2})); - orderAbun = zeros(length(rxnsList), length(TaxonomyLevels{3, 2})); - familyAbun = zeros(length(rxnsList), length(TaxonomyLevels{4, 2})); - genusAbun = zeros(length(rxnsList), length(TaxonomyLevels{5, 2})); - speciesAbun = zeros(length(rxnsList), length(TaxonomyLevels{6, 2})); - end - for j = 1:length(rxnsList) - for k = 2:size(abundance, 1) - % check if the reaction is present in the strain - if ReactionPresence{k, j + 1} == 1 - % calculate total abundance - totalAbun(j) = totalAbun(j) + str2double(abundance{k, i}); - if ~isempty(taxonomy) - % calculate phylum abundance - t = 1; - findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); - if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) - taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); - phylumAbun(j, taxonCol) = phylumAbun(j, taxonCol) + str2double(abundance{k, i}); - end - % calculate class abundance - t = 2; - findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); - if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) - taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); - classAbun(j, taxonCol) = classAbun(j, taxonCol) + str2double(abundance{k, i}); - end - % calculate order abundance - t = 3; - findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); - if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) - taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); - orderAbun(j, taxonCol) = orderAbun(j, taxonCol) + str2double(abundance{k, i}); - end - % calculate family abundance - t = 4; - findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); - if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) - taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); - familyAbun(j, taxonCol) = familyAbun(j, taxonCol) + str2double(abundance{k, i}); - end - % calculate genus abundance - t = 5; - findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); - if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) - taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); - genusAbun(j, taxonCol) = genusAbun(j, taxonCol) + str2double(abundance{k, i}); - end - % calculate species abundance - t = 6; - findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); - if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) - taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); - speciesAbun(j, taxonCol) = speciesAbun(j, taxonCol) + str2double(abundance{k, i}); - end - end - end - end - end - end - %% store the abundances total and on taxonomic levels calculated for the individual in the output structure - for j = 1:length(rxnsList) - ReactionAbundance.('Total'){i, j + 1} = totalAbun(j); - % abundance on taxon levels - end - if ~isempty(taxonomy) - % phylum abundance - t = 1; - cnt = 2; - for j = 1:length(rxnsList) - for l = 1:length(TaxonomyLevels{t, 2}) - ReactionAbundance.(TaxonomyLevels{t}){i, cnt} = phylumAbun(j, l); - cnt = cnt + 1; - end - end - % class abundance - t = 2; - cnt = 2; - for j = 1:length(rxnsList) - for l = 1:length(TaxonomyLevels{t, 2}) - ReactionAbundance.(TaxonomyLevels{t}){i, cnt} = classAbun(j, l); - cnt = cnt + 1; - end - end - % order abundance - t = 3; - cnt = 2; - for j = 1:length(rxnsList) - for l = 1:length(TaxonomyLevels{t, 2}) - ReactionAbundance.(TaxonomyLevels{t}){i, cnt} = orderAbun(j, l); - cnt = cnt + 1; - end - end - % family abundance - t = 4; - cnt = 2; - for j = 1:length(rxnsList) - for l = 1:length(TaxonomyLevels{t, 2}) - ReactionAbundance.(TaxonomyLevels{t}){i, cnt} = familyAbun(j, l); - cnt = cnt + 1; - end - end - % genus abundance - t = 5; - cnt = 2; - for j = 1:length(rxnsList) - for l = 1:length(TaxonomyLevels{t, 2}) - ReactionAbundance.(TaxonomyLevels{t}){i, cnt} = genusAbun(j, l); - cnt = cnt + 1; - end - end - % species abundance - t = 6; - cnt = 2; - for j = 1:length(rxnsList) - for l = 1:length(TaxonomyLevels{t, 2}) - ReactionAbundance.(TaxonomyLevels{t}){i, cnt} = speciesAbun(j, l); - cnt = cnt + 1; - end - end - end -end - -% finally, delete empty columns to avoid unneccessarily big file sizes -fprintf('Finalizing the output file... \n') - -fNames = fieldnames(ReactionAbundance); -for i = 1:length(fNames) - cnt = 1; - delArray = []; - for j = 2:size(ReactionAbundance.(fNames{i}), 2) - cValues = string(ReactionAbundance.(fNames{i})(2:end, j)); - cTotal = sum(str2double(cValues)); - if cTotal < 0.000000001 - delArray(1, cnt) = j; - cnt = cnt + 1; - end - end - if ~isempty(delArray) - ReactionAbundance.(fNames{i})(:, delArray) = []; - end -end - -% export taxonomical information -taxonCol = 'Phylum'; -% remove unnecessary columns -taxonomy(:,taxonCol+1:end)=[]; - -for t = 2:size(TaxonomyLevels, 1) - taxa=ReactionAbundance.(TaxonomyLevels{t})(2:end,1); - TaxonomyReduced=taxonomy; - taxonCol = find(strcmp(taxonomy(1, :), TaxonomyLevels{t})); - TaxonomyReduced(:,1:taxonCol-1)=[]; - % remove duplicate entries - [C,IA] = unique(TaxonomyReduced(:,1),'stable'); - % remove unclassified taxa - findUncl=find(contains(C,'unclassified')); - IA(findUncl,:)=[]; - TaxonomyInfo.(TaxonomyLevels{t})=TaxonomyReduced(IA,:); -end - -% Plot the calculated reaction abundances. -for i = 1:length(fNames) - xlabels = ReactionAbundance.(fNames{i})(1, 2:end); - ylabels = ReactionAbundance.(fNames{i})(2:end, 1); - data = string(ReactionAbundance.(fNames{i})(2:end, 2:end)); - data = str2double(data); - figure; - imagesc(data) - colormap('hot') - colorbar - if length(xlabels) < 50 - set(gca, 'xtick', 1:length(xlabels)); - xticklabels(xlabels); - xtickangle(90) - end - if length(ylabels) < 50 - set(gca, 'ytick', 1:length(ylabels)); - yticklabels(ylabels); - end - set(gca, 'TickLabelInterpreter', 'none'); - title(fNames{i}) -end - -end +function [ReactionAbundance,TaxonomyInfo] = calculateReactionAbundance(abundancePath, modelPath, infoFilePath, rxnsList, numWorkers) +% Part of the Microbiome Modeling Toolbox. This function calculates and +% plots the total abundance of reactions of interest in a given microbiome +% sample based on the strain-level composition. +% Reaction presence or absence in each strain is derived from the reaction content +% of the respective AGORA model. Two results are given: the total abundance, +% and the abundance on different taxonomical levels. +% +% USAGE +% +% [ReactionAbundance,TaxonomyInfo] = calculateReactionAbundance(abundancePath, modelPath, infoFilePath, rxnsList, numWorkers) +% +% INPUTS: +% abundancePath: Path to the .csv file with the abundance data. +% Example: 'cobratoolbox/papers/018_microbiomeModelingToolbox/examples/normCoverage.csv' +% modelPath: Folder containing the strain-specific AGORA models +% OPTIONAL INPUTS: +% infoFilePath: Path to the spreadsheet with the taxonomy +% information on organisms (default: +% AGORA_infoFile.xlsx) +% rxnsList: List of reactions for which the abundance +% should be calculated (if left empty: all +% reactions in all models) +% numWorkers: Number of workers used for parallel pool. If +% left empty, the parallel pool will not be +% started. Parallellization is recommended if +% all reactions are computed. +% +% OUTPUT: +% ReactionAbundance Structure with abundance for each microbiome +% and reaction in total and on taxon levels +% TaxonomyInfo: Taxonomical information on each taxon level +% +% .. Author: - Almut Heinken, 03/2018 +% 10/2018: changed input to location of the csv file with the +% abundance data +% 01/2020: adapted to be suitable for pan-models + +% read the csv file with the abundance data +abundance = readtable(abundancePath, 'ReadVariableNames', false); +abundance = table2cell(abundance); +if isnumeric(abundance{2, 1}) + abundance(:, 1) = []; +end + +% + +% load the models +for i = 2:size(abundance, 1) + model = readCbModel([modelPath filesep abundance{i, 1} '.mat']); + modelsList{i, 1} = model; +end + +if ~exist('rxnsList', 'var') || isempty(rxnsList) % define reaction list if not entered + fprintf('No reaction list entered. Abundances will be calculated for all reactions in all models. \n') + % get model list from abundance input file + for i = 2:size(abundance, 1) + model = modelsList{i, 1}; + rxnsList = vertcat(model.rxns, rxnsList); + end + rxnsList = unique(rxnsList); +end + +% Get the taxonomy information +if exist('infoFilePath','var') && ~isempty(infoFilePath) + taxonomy = readtable(infoFilePath, 'ReadVariableNames', false); + taxonomy = table2cell(taxonomy); +else + taxonomy = readtable('AGORA_infoFile.xlsx', 'ReadVariableNames', false); + taxonomy = table2cell(taxonomy); +end + +% load the models found in the individuals and extract which reactions are +% in which model +for i = 2:size(abundance, 1) + model = modelsList{i, 1}; + ReactionPresence{i, 1} = abundance{i, 1}; + for j = 1:length(rxnsList) + ReactionPresence{1, j + 1} = rxnsList{j}; + if ~isempty(find(ismember(model.rxns, rxnsList{j}))) + ReactionPresence{i, j + 1} = 1; + else + ReactionPresence{i, j + 1} = 0; + end + end +end + +% put together a Matlab structure of the results +ReactionAbundance = struct; + +% prepare table for the total abundance +for j = 1:length(rxnsList) + ReactionAbundance.('Total'){1, j + 1} = rxnsList{j}; +end + +TaxonomyLevels = { + 'Phylum' + 'Class' + 'Order' + 'Family' + 'Genus' + 'Species' + }; +% extract the list of entries on each taxonomical level +for t = 1:size(TaxonomyLevels, 1) + % find the columns corresponding to each taxonomy level and the list of + % unique taxa + taxonCol = find(strcmp(taxonomy(1, :), TaxonomyLevels{t})); + % find and save all entries + taxa = unique(taxonomy(2:end, taxonCol)); + % exclude unclassified entries + taxa(strncmp('unclassified', taxa, taxonCol)) = []; + TaxonomyLevels{t, 2} = taxa; + % define the correct columns in taxonomy table + TaxonomyLevels{t, 3} = taxonCol; + % prepare table for the abundance on taxon levels + cnt = 2; + for j = 1:length(rxnsList) + for l = 1:length(TaxonomyLevels{t, 2}) + ReactionAbundance.(TaxonomyLevels{t, 1}){1, cnt} = strcat(TaxonomyLevels{t, 2}{l}, '_', rxnsList{j}); + cnt = cnt + 1; + end + end +end + +% Find the right column for the input data (strains, species,..) +taxa=taxonomy(2:end,1); +if length(intersect(abundance(2:end,1),taxa))==size(abundance,1)-1 + inputTaxa=taxa; + inputCol=1; +else + abundance(:,1)=regexprep(abundance(:,1),'pan','','once'); + inputTaxa={}; + for i=2:size(taxonomy,2) + taxa=strrep(taxonomy(:,i),' ','_'); + taxa=strrep(taxa,'.','_'); + taxa=strrep(taxa,'/','_'); + taxa=strrep(taxa,'-','_'); + taxa=strrep(taxa,'__','_'); + if length(intersect(abundance(2:end,1),taxa))==size(abundance,1)-1 + inputTaxa=taxa; + inputCol=i; + end + end +end +if isempty(inputTaxa) + error('Some taxa in the abundance file are not found in the taxonomy file!') +end + +for i = 2:size(abundance, 2) + %% calculate reaction abundance for the samples one by one + fprintf(['Calculating reaction abundance for sample ', num2str(i - 1), ' of ' num2str(size(abundance, 2) - 1) '.. \n']) + ReactionAbundance.('Total'){i, 1} = abundance{1, i}; + for t = 1:size(TaxonomyLevels, 1) + ReactionAbundance.(TaxonomyLevels{t, 1}){i, 1} = abundance{1, i}; + end + % use parallel pool if workers specified as input + if exist('numWorkers', 'var') && numWorkers > 0 + poolobj = gcp('nocreate'); + if isempty(poolobj) + parpool(numWorkers) + end + end + % create tables in which abundances for each individual for + % all reactions/taxa are stored + totalAbun = zeros(length(rxnsList), 1); + phylumAbun = zeros(length(rxnsList), length(TaxonomyLevels{1, 2})); + classAbun = zeros(length(rxnsList), length(TaxonomyLevels{2, 2})); + orderAbun = zeros(length(rxnsList), length(TaxonomyLevels{3, 2})); + familyAbun = zeros(length(rxnsList), length(TaxonomyLevels{4, 2})); + genusAbun = zeros(length(rxnsList), length(TaxonomyLevels{5, 2})); + speciesAbun = zeros(length(rxnsList), length(TaxonomyLevels{6, 2})); + + parfor j = 1:length(rxnsList) + + % store the abundance for each reaction and taxon separately in a + % temporary file to enable parallellization + tmpPhyl = zeros(length(rxnsList), length(TaxonomyLevels{1, 2})); + tmpClass = zeros(length(rxnsList), length(TaxonomyLevels{2, 2})); + tmpOrder = zeros(length(rxnsList), length(TaxonomyLevels{3, 2})); + tmpFamily = zeros(length(rxnsList), length(TaxonomyLevels{4, 2})); + tmpGenus = zeros(length(rxnsList), length(TaxonomyLevels{5, 2})); + tmpSpecies = zeros(length(rxnsList), length(TaxonomyLevels{6, 2})); + + for k = 2:size(abundance, 1) + % check if the reaction is present in the strain + if ReactionPresence{k, j + 1} == 1 + % calculate total abundance + totalAbun(j) = totalAbun(j) + str2double(abundance{k, i}); + % calculate phylum abundance + t = 1; + findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); + if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) + taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); + tmpPhyl(1, taxonCol) = tmpPhyl(1, taxonCol) + str2double(abundance{k, i}); + end + % calculate class abundance + t = 2; + findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); + if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) + taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); + tmpClass(1, taxonCol) = tmpClass(1, taxonCol) + str2double(abundance{k, i}); + end + % calculate order abundance + t = 3; + findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); + if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) + taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); + tmpOrder(1, taxonCol) = tmpOrder(1, taxonCol) + str2double(abundance{k, i}); + end + % calculate family abundance + t = 4; + findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); + if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) + taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); + tmpFamily(1, taxonCol) = tmpFamily(1, taxonCol) + str2double(abundance{k, i}); + end + % calculate genus abundance + t = 5; + findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); + if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) + taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); + tmpGenus(1, taxonCol) = tmpGenus(1, taxonCol) + str2double(abundance{k, i}); + end + % calculate species abundance + t = 6; + findTax = taxonomy(find(strcmp(abundance{k, 1}, inputTaxa)), TaxonomyLevels{t, 3}); + if any(strcmp(findTax{1}, TaxonomyLevels{t, 2})) + taxonCol = find(strcmp(findTax{1}, TaxonomyLevels{t, 2})); + tmpSpecies(1, taxonCol) = tmpSpecies(1, taxonCol) + str2double(abundance{k, i}); + end + end + end + phylumAbun(j, :) = tmpPhyl(1, :); + classAbun(j, :) = tmpClass(1, :); + orderAbun(j, :) = tmpOrder(1, :); + familyAbun(j, :) = tmpFamily(1, :); + genusAbun(j, :) = tmpGenus(1, :); + speciesAbun(j, :) = tmpSpecies(1, :); + end + %% store the abundances total and on taxonomic levels calculated for the individual in the output structure + for j = 1:length(rxnsList) + ReactionAbundance.('Total'){i, j + 1} = totalAbun(j); + % abundance on taxon levels + end + % phylum abundance + t = 1; + cnt = 2; + for j = 1:length(rxnsList) + for l = 1:length(TaxonomyLevels{t, 2}) + ReactionAbundance.(TaxonomyLevels{t}){i, cnt} = phylumAbun(j, l); + cnt = cnt + 1; + end + end + % class abundance + t = 2; + cnt = 2; + for j = 1:length(rxnsList) + for l = 1:length(TaxonomyLevels{t, 2}) + ReactionAbundance.(TaxonomyLevels{t}){i, cnt} = classAbun(j, l); + cnt = cnt + 1; + end + end + % order abundance + t = 3; + cnt = 2; + for j = 1:length(rxnsList) + for l = 1:length(TaxonomyLevels{t, 2}) + ReactionAbundance.(TaxonomyLevels{t}){i, cnt} = orderAbun(j, l); + cnt = cnt + 1; + end + end + % family abundance + t = 4; + cnt = 2; + for j = 1:length(rxnsList) + for l = 1:length(TaxonomyLevels{t, 2}) + ReactionAbundance.(TaxonomyLevels{t}){i, cnt} = familyAbun(j, l); + cnt = cnt + 1; + end + end + % genus abundance + t = 5; + cnt = 2; + for j = 1:length(rxnsList) + for l = 1:length(TaxonomyLevels{t, 2}) + ReactionAbundance.(TaxonomyLevels{t}){i, cnt} = genusAbun(j, l); + cnt = cnt + 1; + end + end + % species abundance + t = 6; + cnt = 2; + for j = 1:length(rxnsList) + for l = 1:length(TaxonomyLevels{t, 2}) + ReactionAbundance.(TaxonomyLevels{t}){i, cnt} = speciesAbun(j, l); + cnt = cnt + 1; + end + end +end + +% finally, delete empty columns to avoid unneccessarily big file sizes +fprintf('Finalizing the output file... \n') + +fNames = fieldnames(ReactionAbundance); +for i = 1:length(fNames) + cValues = string(ReactionAbundance.(fNames{i})(2:end, 2:end)); + rownames=ReactionAbundance.(fNames{i})(:,1); + ReactionAbundance.(fNames{i})(:,1)=[]; + cTotal = sum(str2double(cValues),1); + ReactionAbundance.(fNames{i})(:,find(cTotal<0.000000001))=[]; + ReactionAbundance.(fNames{i})=[rownames,ReactionAbundance.(fNames{i})]; + ReactionAbundance.(fNames{i}){1,1}='Samples'; +end + +% export taxonomical information +taxonCol = 'Phylum'; +% remove unnecessary columns +taxonomy(:,taxonCol+1:end)=[]; + +for t = 2:size(TaxonomyLevels, 1) + taxa=ReactionAbundance.(TaxonomyLevels{t})(2:end,1); + TaxonomyReduced=taxonomy; + taxonCol = find(strcmp(taxonomy(1, :), TaxonomyLevels{t})); + TaxonomyReduced(:,1:taxonCol-1)=[]; + % remove duplicate entries + [C,IA] = unique(TaxonomyReduced(:,1),'stable'); + % remove unclassified taxa + findUncl=find(contains(C,'unclassified')); + IA(findUncl,:)=[]; + TaxonomyInfo.(TaxonomyLevels{t})=TaxonomyReduced(IA,:); +end + +% Plot the calculated reaction abundances. +for i = 1:length(fNames) + xlabels = ReactionAbundance.(fNames{i})(2:end,1); + ylabels = ReactionAbundance.(fNames{i})(1,2:end); + data = string(ReactionAbundance.(fNames{i})(2:end, 2:end)); + data = str2double(data); + figure; + imagesc(data') + colormap('hot') + colorbar + if length(xlabels) < 50 + set(gca, 'xtick', 1:length(xlabels)); + xticklabels(xlabels); + xtickangle(90) + end + if length(ylabels) < 50 + set(gca, 'ytick', 1:length(ylabels)); + yticklabels(ylabels); + end + set(gca, 'TickLabelInterpreter', 'none'); + title(fNames{i}) +end + +end \ No newline at end of file diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/additionalAnalysis/calculateSubsystemAbundance.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/additionalAnalysis/calculateSubsystemAbundance.m index ae3a5b1083..090b5bd7f1 100644 --- a/src/analysis/multiSpecies/microbiomeModelingToolbox/additionalAnalysis/calculateSubsystemAbundance.m +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/additionalAnalysis/calculateSubsystemAbundance.m @@ -17,7 +17,7 @@ % AUTHOR % - Almut Heinken, 08/2020 -reactionDatabase = readtable('reactionDatabase.txt', 'Delimiter', 'tab','TreatAsEmpty',['UND. -60001','UND. -2011','UND. -62011'], 'ReadVariableNames', false); +reactionDatabase = readtable('ReactionDatabase.txt', 'Delimiter', 'tab','TreatAsEmpty',['UND. -60001','UND. -2011','UND. -62011'], 'ReadVariableNames', false); reactionDatabase=table2cell(reactionDatabase); reactionAbundance = readtable(reactionAbundancePath, 'ReadVariableNames', false); @@ -54,6 +54,4 @@ end end -writetable(cell2table(subsystemAbundance),'SubsystemAbundance.txt','FileType','text','WriteVariableNames',false,'Delimiter','\t'); - end \ No newline at end of file diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/additionalAnalysis/fastCalculateReactionAbundance.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/additionalAnalysis/fastCalculateReactionAbundance.m new file mode 100644 index 0000000000..48e4acdcc6 --- /dev/null +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/additionalAnalysis/fastCalculateReactionAbundance.m @@ -0,0 +1,119 @@ +function ReactionAbundance = fastCalculateReactionAbundance(abundancePath, modelPath, rxnsList, numWorkers) +% Part of the Microbiome Modeling Toolbox. This function calculates and +% plots the total abundance of reactions of interest in a given microbiome +% sample based on the strain-level composition. +% Reaction presence or absence in each strain is derived from the reaction content +% of the respective AGORA model. +% +% USAGE +% +% ReactionAbundance = fastCalculateReactionAbundance(abundancePath, modelPath, rxnsList, numWorkers) +% +% INPUTS: +% abundancePath: Path to the .csv file with the abundance data. +% Example: 'cobratoolbox/papers/018_microbiomeModelingToolbox/examples/normCoverage.csv' +% modelPath: Folder containing the strain-specific AGORA models +% OPTIONAL INPUTS: +% rxnsList: List of reactions for which the abundance +% should be calculated (if left empty: all +% reactions in all models) +% numWorkers: Number of workers used for parallel pool. If +% left empty, the parallel pool will not be +% started. Parallellization is recommended if +% all reactions are computed. +% +% OUTPUT: +% ReactionAbundance Table with total abundance for each microbiome +% and reaction +% +% .. Author: - Almut Heinken, 04/2021 + +% read the csv file with the abundance data +abundance = readtable(abundancePath, 'ReadVariableNames', false); +abundance = table2cell(abundance); +if isnumeric(abundance{2, 1}) + abundance(:, 1) = []; +end + +% load the models +for i = 2:size(abundance, 1) + model = readCbModel([modelPath filesep abundance{i, 1} '.mat']); + modelsList{i, 1} = model; +end + +if ~exist('rxnsList', 'var') || isempty(rxnsList) % define reaction list if not entered + fprintf('No reaction list entered. Abundances will be calculated for all reactions in all models. \n') + % get model list from abundance input file + for i = 2:size(abundance, 1) + model = modelsList{i, 1}; + rxnsList = vertcat(model.rxns, rxnsList); + end + rxnsList = unique(rxnsList); +end + +% load the models found in the individuals and extract which reactions are +% in which model +for i = 2:size(abundance, 1) + model = modelsList{i, 1}; + ReactionPresence{i, 1} = abundance{i, 1}; + for j = 1:length(rxnsList) + ReactionPresence{1, j + 1} = rxnsList{j}; + if ~isempty(find(ismember(model.rxns, rxnsList{j}))) + ReactionPresence{i, j + 1} = '1'; + else + ReactionPresence{i, j + 1} = '0'; + end + end +end +ReactionPresence{1,1}='Strains'; + + +% prepare table for the total abundance +ReactionAbundance = {}; +for i = 1:length(rxnsList) + ReactionAbundance{1, i + 1} = rxnsList{i}; +end +for i = 2:size(abundance, 2) + ReactionAbundance{i, 1} = abundance{1, i}; +end + +% use parallel pool if workers specified as input +if exist('numWorkers', 'var') && numWorkers > 0 + poolobj = gcp('nocreate'); + if isempty(poolobj) + parpool(numWorkers) + end +end + +clear abundance + +totalAbun={}; +parfor i = 2:size(ReactionAbundance, 1) + i + % reload the file to avoid running out of memory + abundance = readtable(abundancePath, 'ReadVariableNames', false); + abundance = table2cell(abundance); + if isnumeric(abundance{2, 1}) + abundance(:, 1) = []; + end + + % temporarily store reaction abundances + totalAbun{i} = zeros(length(rxnsList), 1); + + for j = 2:size(abundance, 1) + % find all reactions present in the strain + presentRxns = find(strcmp(ReactionPresence(j,2:end),'1')); + + for k = 1:length(presentRxns) + % summarize total abundance + totalAbun{i}(presentRxns(k),1) = totalAbun{i}(presentRxns(k),1) + str2double(abundance{j,i}); + end + end +end + +% collect the temporarily stored abundances to put together the table +for i = 2:size(ReactionAbundance, 1) + ReactionAbundance(i,2:end) = num2cell(totalAbun{i}); +end + +end \ No newline at end of file diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/adaptVMHDietToAGORA.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/adaptVMHDietToAGORA.m index cd991c1355..a8bcf706da 100755 --- a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/adaptVMHDietToAGORA.m +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/adaptVMHDietToAGORA.m @@ -51,7 +51,7 @@ % Define the list of metabolites required by at least one AGORA model for % growth -essentialMetabolites = {'EX_12dgr180(e)'; 'EX_26dap_M(e)'; 'EX_2dmmq8(e)'; 'EX_2obut(e)'; 'EX_3mop(e)'; 'EX_4abz(e)'; 'EX_4hbz(e)'; 'EX_ac(e)'; 'EX_acgam(e)'; 'EX_acmana(e)'; 'EX_acnam(e)'; 'EX_ade(e)'; 'EX_adn(e)'; 'EX_adocbl(e)'; 'EX_adpcbl(e)'; 'EX_ala_D(e)'; 'EX_ala_L(e)'; 'EX_amet(e)'; 'EX_amp(e)'; 'EX_arab_D(e)'; 'EX_arab_L(e)'; 'EX_arg_L(e)'; 'EX_asn_L(e)'; 'EX_btn(e)'; 'EX_ca2(e)'; 'EX_cbl1(e)'; 'EX_cgly(e)'; 'EX_chor(e)'; 'EX_chsterol(e)'; 'EX_cit(e)'; 'EX_cl(e)'; 'EX_cobalt2(e)'; 'EX_csn(e)'; 'EX_cu2(e)'; 'EX_cys_L(e)'; 'EX_cytd(e)'; 'EX_dad_2(e)'; 'EX_dcyt(e)'; 'EX_ddca(e)'; 'EX_dgsn(e)'; 'EX_fald(e)'; 'EX_fe2(e)'; 'EX_fe3(e)'; 'EX_fol(e)'; 'EX_for(e)'; 'EX_gal(e)'; 'EX_glc_D(e)'; 'EX_gln_L(e)'; 'EX_glu_L(e)'; 'EX_gly(e)'; 'EX_glyc(e)'; 'EX_glyc3p(e)'; 'EX_gsn(e)'; 'EX_gthox(e)'; 'EX_gthrd(e)'; 'EX_gua(e)'; 'EX_h(e)'; 'EX_h2o(e)'; 'EX_h2s(e)'; 'EX_his_L(e)'; 'EX_hxan(e)'; 'EX_ile_L(e)'; 'EX_k(e)'; 'EX_lanost(e)'; 'EX_leu_L(e)'; 'EX_lys_L(e)'; 'EX_malt(e)'; 'EX_met_L(e)'; 'EX_mg2(e)'; 'EX_mn2(e)'; 'EX_mqn7(e)'; 'EX_mqn8(e)'; 'EX_nac(e)'; 'EX_ncam(e)'; 'EX_nmn(e)'; 'EX_no2(e)'; 'EX_ocdca(e)'; 'EX_ocdcea(e)'; 'EX_orn(e)'; 'EX_phe_L(e)'; 'EX_pheme(e)'; 'EX_pi(e)'; 'EX_pnto_R(e)'; 'EX_pro_L(e)'; 'EX_ptrc(e)'; 'EX_pydx(e)'; 'EX_pydxn(e)'; 'EX_q8(e)'; 'EX_rib_D(e)'; 'EX_ribflv(e)'; 'EX_ser_L(e)'; 'EX_sheme(e)'; 'EX_so4(e)'; 'EX_spmd(e)'; 'EX_thm(e)'; 'EX_thr_L(e)'; 'EX_thymd(e)'; 'EX_trp_L(e)'; 'EX_ttdca(e)'; 'EX_tyr_L(e)'; 'EX_ura(e)'; 'EX_val_L(e)'; 'EX_xan(e)'; 'EX_xyl_D(e)'; 'EX_zn2(e)'; 'EX_glu_D(e)'; 'EX_melib(e)'; 'EX_chtbs(e)'; 'EX_metsox_S_L(e)'; 'EX_hdca(e)'; 'EX_gam(e)'; 'EX_indole(e)'; 'EX_glcn(e)'; 'EX_coa(e)'}; +essentialMetabolites = {'EX_12dgr180(e)'; 'EX_26dap_M(e)'; 'EX_2dmmq8(e)'; 'EX_2obut(e)'; 'EX_3mop(e)'; 'EX_4abz(e)'; 'EX_4hbz(e)'; 'EX_ac(e)'; 'EX_acgam(e)'; 'EX_acmana(e)'; 'EX_acnam(e)'; 'EX_ade(e)'; 'EX_adn(e)'; 'EX_adocbl(e)'; 'EX_ala_D(e)'; 'EX_ala_L(e)'; 'EX_amet(e)'; 'EX_amp(e)'; 'EX_arab_D(e)'; 'EX_arab_L(e)'; 'EX_arg_L(e)'; 'EX_asn_L(e)'; 'EX_btn(e)'; 'EX_ca2(e)'; 'EX_cbl1(e)'; 'EX_cgly(e)'; 'EX_chor(e)'; 'EX_chsterol(e)'; 'EX_cit(e)'; 'EX_cl(e)'; 'EX_cobalt2(e)'; 'EX_csn(e)'; 'EX_cu2(e)'; 'EX_cys_L(e)'; 'EX_cytd(e)'; 'EX_dad_2(e)'; 'EX_dcyt(e)'; 'EX_ddca(e)'; 'EX_dgsn(e)'; 'EX_fald(e)'; 'EX_fe2(e)'; 'EX_fe3(e)'; 'EX_fol(e)'; 'EX_for(e)'; 'EX_gal(e)'; 'EX_glc_D(e)'; 'EX_gln_L(e)'; 'EX_glu_L(e)'; 'EX_gly(e)'; 'EX_glyc(e)'; 'EX_glyc3p(e)'; 'EX_gsn(e)'; 'EX_gthox(e)'; 'EX_gthrd(e)'; 'EX_gua(e)'; 'EX_h(e)'; 'EX_h2o(e)'; 'EX_h2s(e)'; 'EX_his_L(e)'; 'EX_hxan(e)'; 'EX_ile_L(e)'; 'EX_k(e)'; 'EX_lanost(e)'; 'EX_leu_L(e)'; 'EX_lys_L(e)'; 'EX_malt(e)'; 'EX_met_L(e)'; 'EX_mg2(e)'; 'EX_mn2(e)'; 'EX_mqn7(e)'; 'EX_mqn8(e)'; 'EX_nac(e)'; 'EX_ncam(e)'; 'EX_nmn(e)'; 'EX_no2(e)'; 'EX_ocdca(e)'; 'EX_ocdcea(e)'; 'EX_orn(e)'; 'EX_phe_L(e)'; 'EX_pheme(e)'; 'EX_pi(e)'; 'EX_pnto_R(e)'; 'EX_pro_L(e)'; 'EX_ptrc(e)'; 'EX_pydx(e)'; 'EX_pydxn(e)'; 'EX_q8(e)'; 'EX_rib_D(e)'; 'EX_ribflv(e)'; 'EX_ser_L(e)'; 'EX_sheme(e)'; 'EX_so4(e)'; 'EX_spmd(e)'; 'EX_thm(e)'; 'EX_thr_L(e)'; 'EX_thymd(e)'; 'EX_trp_L(e)'; 'EX_ttdca(e)'; 'EX_tyr_L(e)'; 'EX_ura(e)'; 'EX_val_L(e)'; 'EX_xan(e)'; 'EX_xyl_D(e)'; 'EX_zn2(e)'; 'EX_glu_D(e)'; 'EX_melib(e)'; 'EX_chtbs(e)'; 'EX_metsox_S_L(e)'; 'EX_hdca(e)'; 'EX_gam(e)'; 'EX_indole(e)'; 'EX_glcn(e)'; 'EX_coa(e)'; 'EX_man(e)'; 'EX_fum(e)'; 'EX_succ(e)'; 'EX_no3(e)'; 'EX_ins(e)'; 'EX_uri(e)'; 'EX_drib(e)'; 'EX_pime(e)'; 'EX_lac_L(e)'; 'EX_glypro(e)'; 'EX_urea(e)'; 'EX_duri(e)'; 'EX_h2(e)'; 'EX_mal_L(e)'; 'EX_tre(e)'; 'EX_orot(e)'}; % fix any exchange nomenclature issues adaptedDietConstraints(:, 1) = strrep(adaptedDietConstraints(:, 1), '[e]', '(e)'); diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/buildModelStorage.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/buildModelStorage.m new file mode 100644 index 0000000000..5ae37cfa92 --- /dev/null +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/buildModelStorage.m @@ -0,0 +1,78 @@ +function [exch,modelStoragePath] = buildModelStorage(microbeNames,modPath) + +currentDir=pwd; +mkdir('modelStorage') +cd('modelStorage') +modelStoragePath = pwd; + +exch = {}; +for j = 1:size(microbeNames, 1) + model = readCbModel([modPath filesep microbeNames{j,1} '.mat']); + %exch = union(exch, model.mets(find(sum(model.S(:, strncmp('EX_', model.rxns, 3)), 2) ~= 0))); + exStruct = findSExRxnInd(model); + new_exch = findMetsFromRxns(model,model.rxns(exStruct.ExchRxnBool & ~exStruct.biomassBool)); + exch = union(exch,new_exch); +end + +% get already built reconstructions +dInfo = dir(modelStoragePath); +modelList={dInfo.name}; +modelList=modelList'; +modelList=strrep(modelList,'.mat',''); +microbesNames=setdiff(microbeNames,modelList); + + +if length(microbesNames)>0 + %% create a new extracellular space [u] for microbes + for j = 1:size(microbeNames, 1) + model = readCbModel([modPath filesep microbeNames{j,1} '.mat']); + % temp fix + if isfield(model,'C') + model=rmfield(model,'C'); + model=rmfield(model,'d'); + end + % + + % removing possible constraints of the bacs + selExc = findExcRxns(model); + Reactions2 = model.rxns(find(selExc)); + allex = Reactions2(strmatch('EX', Reactions2)); + biomass = allex(find(strncmp(allex,'bio',3))); + finrex = setdiff(allex, biomass); + model = changeRxnBounds(model, finrex, -1000, 'l'); + model = changeRxnBounds(model, finrex, 1000, 'u'); + + % removing blocked reactions from the bacs + %BlockedRxns = identifyFastBlockedRxns(model,model.rxns, printLevel); + %model= removeRxns(model, BlockedRxns); + %BlockedReaction = findBlockedReaction(model,'L2') + + model = convertOldStyleModel(model); + exmod = model.rxns(strncmp('EX_', model.rxns, 3)); % find exchange reactions + eMets = model.mets(~cellfun(@isempty, strfind(model.mets, '[e]'))); % exchanged metabolites + dummyMicEU = createModel(); + %dummyMicEU = makeDummyModel(2 * size(eMets, 1), size(eMets, 1)); + dummyMicEUmets = [strcat(strcat(microbeNames{j, 1}, '_'), regexprep(eMets, '\[e\]', '\[u\]')); regexprep(eMets, '\[e\]', '\[u\]')]; + dummyMicEU = addMultipleMetabolites(dummyMicEU,dummyMicEUmets); + nMets = numel(eMets); + S = [speye(nMets);-speye(nMets)]; + lbs = repmat(-1000,nMets,1); + ubs = repmat(1000,nMets,1); + names = strcat(strcat(microbeNames{j, 1}, '_'), 'IEX_', regexprep(eMets, '\[e\]', '\[u\]'), 'tr'); + dummyMicEU = addMultipleReactions(dummyMicEU,names,dummyMicEUmets,S,'lb',lbs,'ub',ubs); + model = removeRxns(model, exmod); + model.rxns = strcat(strcat(microbeNames{j, 1}, '_'), model.rxns); + model.mets = strcat(strcat(microbeNames{j, 1}, '_'), regexprep(model.mets, '\[e\]', '\[u\]')); % replace [e] with [u] + [model] = mergeTwoModels(dummyMicEU, model, 2, false, false); + + %finish up by A: removing duplicate reactions + %We will lose information here, but we will just remove the duplicates. + [model,rxnToRemove,rxnToKeep]= checkDuplicateRxn(model,'S',1,0,1); + + writeCbModel(model,'format','mat','fileName',[microbeNames{j,1} '.mat']); % store model + end +end + +cd(currentDir) + +end \ No newline at end of file diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/createPersonalizedModel.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/createPersonalizedModel.m index 27cddfe691..b3db602737 100755 --- a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/createPersonalizedModel.m +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/createPersonalizedModel.m @@ -28,7 +28,7 @@ createdModels = {}; % use the setup model containing every strain in every sample -parfor k = 1:length(sampNames) +for k = 1:length(sampNames) mgmodel = model; abunRed = abundance(:,k+1); @@ -64,7 +64,7 @@ % Coupling constraints for bacteria for i = 1:length(presBac) IndRxns=find(strncmp(mgmodel.rxns,[presBac{i,1} '_'],length(presBac{i,1})+1));%finding indixes of specific reactions - % find the name of biomass reacion in the microbe model + % find the name of biomass reaction in the microbe model bioRxn=mgmodel.rxns{find(strncmp(mgmodel.rxns,strcat(presBac{i,1},'_bio'),length(char(strcat(presBac{i,1},'_bio')))))}; mgmodel=coupleRxnList2Rxn(mgmodel,mgmodel.rxns(IndRxns(1:length(mgmodel.rxns(IndRxns(:,1)))-1,1)),bioRxn,400,0); %couple the specific reactions end diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/fastSetupCreator.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/fastSetupCreator.m index a25377e0e6..750de3eb47 100755 --- a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/fastSetupCreator.m +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/fastSetupCreator.m @@ -1,4 +1,4 @@ -function model = fastSetupCreator(modPath, microbeNames, host, objre, numWorkers) +function model = fastSetupCreator(exch, modelStoragePath, microbeNames, host, objre, buildSetupAll) % creates a microbiota model (min 1 microbe) that can be coupled with a host % model. Microbes and host are connected with a lumen compartment [u], host % can secrete metabolites into body fluids [b]. Diet is simulated as uptake @@ -25,35 +25,22 @@ % host: Host COBRA model structure, can be left empty if % there is no host model % objre: char with reaction name of objective function of microbeNames -% numWorkers: integer indicating the number of cores to use for parallelization +% buildSetupAll: boolean indicating the strategy that should be used to +% build personalized models: if true, build a global setup model +% containing all organisms in at least model (default), false: create +% models one by one (recommended for more than ~500 organisms total) % % OUTPUT: % model: COBRA model structure with all models combined % % .. Author: Stefania Magnusdottir and Federico Baldini 2016-2018 - -% set parallel pool -if numWorkers > 1 - poolobj = gcp('nocreate'); - if isempty(poolobj) - parpool(numWorkers) - end -end +% - Almut Heinken, 04/2021: adapted strategy for improved speed. if ~isempty(host) % Get list of all exchanged metabolites %exch = host.mets(find(sum(host.S(:, strncmp('EX_', host.rxns, 3)), 2) ~= 0)); exStruct = findSExRxnInd(host); - exch = findMetsFromRxns(host,host.rxns(exStruct.ExchRxnBool & ~exStruct.biomassBool)); -else - exch = {}; + exch = union(exch,findMetsFromRxns(host,host.rxns(exStruct.ExchRxnBool & ~exStruct.biomassBool))); end - for j = 1:size(microbeNames, 1) - model = readCbModel([modPath filesep microbeNames{j,1} '.mat']); - %exch = union(exch, model.mets(find(sum(model.S(:, strncmp('EX_', model.rxns, 3)), 2) ~= 0))); - exStruct = findSExRxnInd(model); - new_exch = findMetsFromRxns(model,model.rxns(exStruct.ExchRxnBool & ~exStruct.biomassBool)); - exch = union(exch,new_exch); - end % The biomass 'biomass[c]' should not be inserted in the list of exchanges. % Hence it will be removed. @@ -71,20 +58,20 @@ nMets = numel(umets); stoich = [-speye(nMets),-speye(nMets),sparse(nMets,nMets),sparse(nMets,nMets);... - sparse(nMets,nMets),speye(nMets),-speye(nMets),sparse(nMets,nMets);... - sparse(nMets,nMets),sparse(nMets,nMets),speye(nMets),-speye(nMets)]; + sparse(nMets,nMets),speye(nMets),-speye(nMets),sparse(nMets,nMets);... + sparse(nMets,nMets),sparse(nMets,nMets),speye(nMets),-speye(nMets)]; lbs = [repmat(-1000,nMets,1);zeros(nMets,1);zeros(nMets,1);repmat(-1000,nMets,1)]; ubs = repmat(1000,4*nMets,1); rxnNames = [strcat('EX_',mets(1:nMets));... - strcat('DUt_',strrep(umets,'[e]',''));... - strcat('UFEt_',strrep(umets,'[e]',''));... - strcat('EX_',strrep(umets, '[e]', '[fe]'))]; + strcat('DUt_',strrep(umets,'[e]',''));... + strcat('UFEt_',strrep(umets,'[e]',''));... + strcat('EX_',strrep(umets, '[e]', '[fe]'))]; dummy = addMultipleReactions(dummy,rxnNames,mets,stoich,'lb',lbs,'ub',ubs'); order = [1:nMets;nMets+1:2*nMets;2*nMets+1:3*nMets;3*nMets+1:4*nMets]; order = order(:); dummy = updateFieldOrderForType(dummy,'rxns',order); %Now, we could 'reorder' this reaction list but I'm not sure its necessary. -% +% % cnt = 0; % for j = 1:size(exch, 1) % mdInd = find(ismember(dummy.mets, strrep(exch{j, 1}, '[e]', '[d]'))); @@ -119,175 +106,155 @@ %% create a new extracellular space [b] for host if ~isempty(host) -exMets = find(~cellfun(@isempty, strfind(host.mets, '[e]'))); % find all mets that appear in [e] -exRxns = host.rxns(strncmp('EX_', host.rxns, 3)); % find exchanges in host -exMetRxns = find(sum(abs(host.S(exMets, :)), 1) ~= 0); % find reactions that contain mets from [e] -exMetRxns = exMetRxns'; -exMetRxnsMets = find(sum(abs(host.S(:, exMetRxns)), 2) ~= 0); % get all metabolites of [e] containing rxns -dummyHostB = createModel(); %makeDummyModel(size(exMetRxnsMets, 1), size(exMetRxns, 1)); -dummyHostB = addMultipleMetabolites(dummyHostB,strcat({'Host_'}, regexprep(host.mets(exMetRxnsMets), '\[e\]', '\[b\]'))); -dummyHostB = addMultipleReactions(dummyHostB,strcat({'Host_'}, host.rxns(exMetRxns), {'b'}),dummyHostB.mets,host.S(exMetRxnsMets, exMetRxns),'c',host.c(exMetRxns),'lb',host.lb(exMetRxns),'ub',host.ub(exMetRxns)); -%dummyHostB.rxns = ; -%dummyHostB.mets = strcat({'Host_'}, regexprep(host.mets(exMetRxnsMets), '\[e\]', '\[b\]')); % replace [e] with [b] -%dummyHostB.S = host.S(exMetRxnsMets, exMetRxns); -%dummyHostB.c = host.c(exMetRxns); -%dummyHostB.lb = host.lb(exMetRxns); -%dummyHostB.ub = host.ub(exMetRxns); - - -% remove exchange reactions from host while leaving demand and sink -% reactions -host = removeRxns(host, exRxns); -host.mets = strcat({'Host_'}, host.mets); -host.rxns = strcat({'Host_'}, host.rxns); - -% use mergeToModels without combining genes -[host] = mergeTwoModels(dummyHostB, host, 2, false, false); - -% Change remaining [e] (transporters) to [u] to transport diet metabolites -exMets2 = ~cellfun(@isempty, strfind(host.mets, '[e]')); % again, find all mets that appear in [e] -% exMetRxns2=find(sum(host.S(exMets2,:),1)~=0);%find reactions that contain mets from [e] -% exMetRxns2=exMetRxns2'; -% exMetRxnsMets2=find(sum(host.S(:,exMetRxns2),2)~=0);%get all metabolites of [e] containing rxns -% host.mets=regexprep(host.mets,'\[e\]','\[u\]');%replace [e] with [u] -dummyHostEU = createModel(); -%makeDummyModel(2 * size(exMets2, 1), size(exMets2, 1)); -hostmets = host.mets(exMets2); -dummyHostEUmets = [strrep(strrep(hostmets, 'Host_', ''), '[e]', '[u]'); hostmets]; -dummyHostEU = addMultipleMetabolites(dummyHostEU,dummyHostEUmets); -nMets = numel(hostmets); -S = [-speye(nMets),speye(nMets)]; -lbs = repmat(-1000,nMets,1); -ubs = repmat(1000,nMets,1); -names = strrep(strcat('Host_IEX_', strrep(hostmets, 'Host_', ''), 'tr'), '[e]', '[u]'); -dummyHostEU = addMultipleReactions(dummyHostEU,names,dummyHostEUmets,S','lb',lbs,'ub',ubs); -% for j = 1:size(exMets2, 1) -% dummyHostEU.rxns{j, 1} = strrep(strcat('Host_IEX_', strrep(host.mets{exMets2(j), 1}, 'Host_', ''), 'tr'), '[e]', '[u]'); -% metU = find(ismember(dummyHostEU.mets, strrep(strrep(host.mets{exMets2(j)}, 'Host_', ''), '[e]', '[u]'))); -% metE = find(ismember(dummyHostEU.mets, host.mets{exMets2(j)})); -% dummyHostEU.S(metU, j) = 1; -% dummyHostEU.S(metE, j) = -1; -% dummyHostEU.lb(j) = -1000; -% dummyHostEU.ub(j) = 1000; -% end -[host] = mergeTwoModels(dummyHostEU, host, 2, false, false); -end - - -%% create a new extracellular space [u] for microbes, code runs in parallel -modelStorage = cell(size(microbeNames)); -% MexGJoined=MexGHost; -parfor j = 1:size(microbeNames, 1) - model = readCbModel([modPath filesep microbeNames{j,1} '.mat']); + exMets = find(~cellfun(@isempty, strfind(host.mets, '[e]'))); % find all mets that appear in [e] + exRxns = host.rxns(strncmp('EX_', host.rxns, 3)); % find exchanges in host + exMetRxns = find(sum(abs(host.S(exMets, :)), 1) ~= 0); % find reactions that contain mets from [e] + exMetRxns = exMetRxns'; + exMetRxnsMets = find(sum(abs(host.S(:, exMetRxns)), 2) ~= 0); % get all metabolites of [e] containing rxns + dummyHostB = createModel(); %makeDummyModel(size(exMetRxnsMets, 1), size(exMetRxns, 1)); + dummyHostB = addMultipleMetabolites(dummyHostB,strcat({'Host_'}, regexprep(host.mets(exMetRxnsMets), '\[e\]', '\[b\]'))); + dummyHostB = addMultipleReactions(dummyHostB,strcat({'Host_'}, host.rxns(exMetRxns), {'b'}),dummyHostB.mets,host.S(exMetRxnsMets, exMetRxns),'c',host.c(exMetRxns),'lb',host.lb(exMetRxns),'ub',host.ub(exMetRxns)); + %dummyHostB.rxns = ; + %dummyHostB.mets = strcat({'Host_'}, regexprep(host.mets(exMetRxnsMets), '\[e\]', '\[b\]')); % replace [e] with [b] + %dummyHostB.S = host.S(exMetRxnsMets, exMetRxns); + %dummyHostB.c = host.c(exMetRxns); + %dummyHostB.lb = host.lb(exMetRxns); + %dummyHostB.ub = host.ub(exMetRxns); - % removing possible constraints of the bacs - selExc = findExcRxns(model); - Reactions2 = model.rxns(find(selExc)); - allex = Reactions2(strmatch('EX', Reactions2)); - biomass = allex(strmatch(objre, allex)); - finrex = setdiff(allex, biomass); - model = changeRxnBounds(model, finrex, -1000, 'l'); - model = changeRxnBounds(model, finrex, 1000, 'u'); - % removing blocked reactions from the bacs - %BlockedRxns = identifyFastBlockedRxns(model,model.rxns, printLevel); - %model= removeRxns(model, BlockedRxns); - %BlockedReaction = findBlockedReaction(model,'L2') + % remove exchange reactions from host while leaving demand and sink + % reactions + host = removeRxns(host, exRxns); + host.mets = strcat({'Host_'}, host.mets); + host.rxns = strcat({'Host_'}, host.rxns); - model = convertOldStyleModel(model); - exmod = model.rxns(strncmp('EX_', model.rxns, 3)); % find exchange reactions - eMets = model.mets(~cellfun(@isempty, strfind(model.mets, '[e]'))); % exchanged metabolites - dummyMicEU = createModel(); - %dummyMicEU = makeDummyModel(2 * size(eMets, 1), size(eMets, 1)); - dummyMicEUmets = [strcat(strcat(microbeNames{j, 1}, '_'), regexprep(eMets, '\[e\]', '\[u\]')); regexprep(eMets, '\[e\]', '\[u\]')]; - dummyMicEU = addMultipleMetabolites(dummyMicEU,dummyMicEUmets); - nMets = numel(eMets); - S = [speye(nMets);-speye(nMets)]; + % use mergeToModels without combining genes + [host] = mergeTwoModels(dummyHostB, host, 2, false, false); + + % Change remaining [e] (transporters) to [u] to transport diet metabolites + exMets2 = ~cellfun(@isempty, strfind(host.mets, '[e]')); % again, find all mets that appear in [e] + % exMetRxns2=find(sum(host.S(exMets2,:),1)~=0);%find reactions that contain mets from [e] + % exMetRxns2=exMetRxns2'; + % exMetRxnsMets2=find(sum(host.S(:,exMetRxns2),2)~=0);%get all metabolites of [e] containing rxns + % host.mets=regexprep(host.mets,'\[e\]','\[u\]');%replace [e] with [u] + dummyHostEU = createModel(); + %makeDummyModel(2 * size(exMets2, 1), size(exMets2, 1)); + hostmets = host.mets(exMets2); + dummyHostEUmets = [strrep(strrep(hostmets, 'Host_', ''), '[e]', '[u]'); hostmets]; + dummyHostEU = addMultipleMetabolites(dummyHostEU,dummyHostEUmets); + nMets = numel(hostmets); + S = [-speye(nMets),speye(nMets)]; lbs = repmat(-1000,nMets,1); ubs = repmat(1000,nMets,1); - names = strcat(strcat(microbeNames{j, 1}, '_'), 'IEX_', regexprep(eMets, '\[e\]', '\[u\]'), 'tr'); - dummyMicEU = addMultipleReactions(dummyMicEU,names,dummyMicEUmets,S,'lb',lbs,'ub',ubs); - model = removeRxns(model, exmod); - model.rxns = strcat(strcat(microbeNames{j, 1}, '_'), model.rxns); - model.mets = strcat(strcat(microbeNames{j, 1}, '_'), regexprep(model.mets, '\[e\]', '\[u\]')); % replace [e] with [u] - [model] = mergeTwoModels(dummyMicEU, model, 2, false, false); - modelStorage{j, 1} = model; % store model + names = strrep(strcat('Host_IEX_', strrep(hostmets, 'Host_', ''), 'tr'), '[e]', '[u]'); + dummyHostEU = addMultipleReactions(dummyHostEU,names,dummyHostEUmets,S','lb',lbs,'ub',ubs); + % for j = 1:size(exMets2, 1) + % dummyHostEU.rxns{j, 1} = strrep(strcat('Host_IEX_', strrep(host.mets{exMets2(j), 1}, 'Host_', ''), 'tr'), '[e]', '[u]'); + % metU = find(ismember(dummyHostEU.mets, strrep(strrep(host.mets{exMets2(j)}, 'Host_', ''), '[e]', '[u]'))); + % metE = find(ismember(dummyHostEU.mets, host.mets{exMets2(j)})); + % dummyHostEU.S(metU, j) = 1; + % dummyHostEU.S(metE, j) = -1; + % dummyHostEU.lb(j) = -1000; + % dummyHostEU.ub(j) = 1000; + % end + [host] = mergeTwoModels(dummyHostEU, host, 2, false, false); end -%% Merge the models in a parallel way - -% Find the base 2 log of the number of models (how many branches are needed), and merge the models two by two: -% In each column of model storage the number of models decreases of half -%(because they have been pairwise merged) till the last column where only -% one big model is contained. The models that are not pairwise merged -%(because number of rows is not even ) are stored and then merged -% sequentially to the big model. - - -pos = {}; % array where the position of models that cannot be merged pairwise (because their number in that iter is not -% even) in the original modelStorage vector is stored -dim = size(microbeNames, 1); -for j = 2:(floor(log2(size(microbeNames, 1))) + 1) % +1 because it starts with one column shifted - if mod(dim, 2) == 1 % check if number is even or not - halfdim = dim - 1; % approximated half dimension (needed to find how many iters to do - % for the pairwise merging - pos{1, j} = halfdim + 1; % find index of extramodel - halfdim = halfdim / 2; - else - halfdim = dim / 2; % no need for approximation +if buildSetupAll + % Merge the models in a parallel way + % First load the stored models with lumen compartment in place + modelStorage = {}; + for i = 1:size(microbeNames, 1) + loadedModel = readCbModel([modelStoragePath filesep microbeNames{i,1} '.mat']); + modelStorage{i, 1} = loadedModel; end - FirstSaveStore=modelStorage(:,(j-1)); - % SecondSaveStore=modelStorage(:,(j-1)); %changes 010318 - modelStorage(1:(dim-1),(j-1))={[]}; %this line will erase all the models from the container - %with the only exception of the last one that might be needed to be - %merged separately. This prevents a dramatic increase in ram usage in - %each iteration as result of stoaring all the merging three. - parfor k=1:halfdim - parind = k; - parind=parind+(k-1); - FirstMod=FirstSaveStore(parind); - % SecondMod=SecondSaveStore(parind+1);%changes 010318 - SecondMod=FirstSaveStore(parind+1);%changes 010318 - % modelStorage{k,j} = mergeTwoModels(FirstMod{1},SecondMod{1},1,false,false)%changes 010318 - modelStorage{k,j} = mergeTwoModels(FirstMod{1},SecondMod{1},1,false,false) + % Find the base 2 log of the number of models (how many branches are needed), and merge the models two by two: + % In each column of model storage the number of models decreases of half + %(because they have been pairwise merged) till the last column where only + % one big model is contained. The models that are not pairwise merged + %(because number of rows is not even ) are stored and then merged + % sequentially to the big model. + + pos = {}; % array where the position of models that cannot be merged pairwise (because their number in that iter is not + % even) in the original modelStorage vector is stored + dim = size(microbeNames, 1); + for j = 2:(floor(log2(size(microbeNames, 1))) + 1) % +1 because it starts with one column shifted + if mod(dim, 2) == 1 % check if number is even or not + halfdim = dim - 1; % approximated half dimension (needed to find how many iters to do + % for the pairwise merging + pos{1, j} = halfdim + 1; % find index of extramodel + halfdim = halfdim / 2; + else + halfdim = dim / 2; % no need for approximation + end + FirstSaveStore=modelStorage(:,(j-1)); + % SecondSaveStore=modelStorage(:,(j-1)); %changes 010318 + modelStorage(1:(dim-1),(j-1))={[]}; %this line will erase all the models from the container + %with the only exception of the last one that might be needed to be + %merged separately. This prevents a dramatic increase in ram usage in + %each iteration as result of stoaring all the merging three. + + for k=1:halfdim + parind = k; + parind=parind+(k-1); + FirstMod=FirstSaveStore(parind); + % SecondMod=SecondSaveStore(parind+1);%changes 010318 + SecondMod=FirstSaveStore(parind+1);%changes 010318 + % modelStorage{k,j} = mergeTwoModels(FirstMod{1},SecondMod{1},1,false,false)%changes 010318 + modelStorage{k,j} = mergeTwoModels(FirstMod{1},SecondMod{1},1,false,false); + end + dim = halfdim; end - dim = halfdim; -end - -% Merging the models remained alone and non-pairwise matched -if isempty(pos)== 1 %all the models were pairwise-merged -[model] = modelStorage{1,(floor(log2(size(microbeNames,1)))+1)}; -else - position = pos(1,:); %finding positions of non merged models - nexmod = find(~cellfun(@isempty,pos(1,:))); - toMerge = cell2mat(position(nexmod));%list of models still to merge - if (length(toMerge)) > 1 %more than 1 model was not pairwise merged - for k=2:(length(toMerge)+1) - if k==2 - [model] = mergeTwoModels(modelStorage{toMerge(1,k-1),(nexmod(k-1))-1},modelStorage{toMerge(1,k),(nexmod(k))-1},1,false,false); - elseif k > 3 - [model] = mergeTwoModels(modelStorage{toMerge(1,k-1),(nexmod(k-1))-1},model,1,false,false); + + % Merging the models remained alone and non-pairwise matched + if isempty(pos)== 1 %all the models were pairwise-merged + [model] = modelStorage{1,(floor(log2(size(microbeNames,1)))+1)}; + else + position = pos(1,:); %finding positions of non merged models + nexmod = find(~cellfun(@isempty,pos(1,:))); + toMerge = cell2mat(position(nexmod));%list of models still to merge + if (length(toMerge)) > 1 %more than 1 model was not pairwise merged + for k=2:(length(toMerge)+1) + if k==2 + [model] = mergeTwoModels(modelStorage{toMerge(1,k-1),(nexmod(k-1))-1},modelStorage{toMerge(1,k),(nexmod(k))-1},1,false,false); + elseif k > 3 + [model] = mergeTwoModels(modelStorage{toMerge(1,k-1),(nexmod(k-1))-1},model,1,false,false); + end end + [model] = mergeTwoModels(modelStorage{1,(floor(log2(size(microbeNames,1)))+1)},model,1,false,false); + end + if (length(toMerge)) == 1 %1 model was not pairwise merged + [model] = mergeTwoModels(modelStorage{1,(floor(log2(size(microbeNames,1)))+1)},modelStorage{toMerge(1,1),(nexmod-1)},1,false,false); end - [model] = mergeTwoModels(modelStorage{1,(floor(log2(size(microbeNames,1)))+1)},model,1,false,false); end - if (length(toMerge)) == 1 %1 model was not pairwise merged - [model] = mergeTwoModels(modelStorage{1,(floor(log2(size(microbeNames,1)))+1)},modelStorage{toMerge(1,1),(nexmod-1)},1,false,false); + +else + % merge in non-parallel way + for i = 2:size(microbeNames, 1) + if i==2 + model1 = readCbModel([modelStoragePath filesep microbeNames{1,1} '.mat']); + modelNew = readCbModel([modelStoragePath filesep microbeNames{i,1} '.mat']); + model = mergeTwoModels(model1,modelNew,1,false,false); + else + modelNew = readCbModel([modelStoragePath filesep microbeNames{i,1} '.mat']); + model = mergeTwoModels(model,modelNew,1,false,false); + end end end +% Merging with host if present + +% temp fix +if isfield(model,'C') + model=rmfield(model,'C'); +end +% -% Merging with host if present if ~isempty(host) [model] = mergeTwoModels(host,model,1,false,false); end [model] = mergeTwoModels(dummy,model,2,false,false); -%finish up by A: removing duplicate reactions -%We will lose information here, but we will just remove the duplicates. -[model,rxnToRemove,rxnToKeep]= checkDuplicateRxn(model,'S',1,0,1); - end - diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/getIndividualSizeName.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/getIndividualSizeName.m index f0e4397669..3c84985fa8 100755 --- a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/getIndividualSizeName.m +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/getIndividualSizeName.m @@ -1,17 +1,20 @@ -function [sampNames, organisms] = getIndividualSizeName(abunFilePath) +function [sampNames, organisms, exMets] = getIndividualSizeName(abunFilePath,modPath) % This function automatically detects organisms, names and number of individuals present % in the study. % % USAGE: % -% [sampNames, organisms] = getIndividualSizeName(abunFilePath) +% [sampNames, organisms, exMets] = getIndividualSizeName(abunFilePath) % % INPUTS: -% abunFilePath: char with path and name of file from which to retrieve information +% abunFilePath: char with path and name of file from which to retrieve information +% modPath: char with path of directory where models are stored % % OUTPUTS: -% sampNamess: nx1 cell array cell array with names of individuals in the study -% organisms: nx1 cell array cell array with names of organisms in the study +% sampNames: nx1 cell array cell array with names of individuals in the study +% organisms: nx1 cell array cell array with names of organisms in the study +% exMets: cell array with all unique extracellular metabolites +% contained in the models % % .. Author: Federico Baldini 2017-2018 % Almut Heinken, 03/2021: simplified inputs @@ -65,4 +68,17 @@ % getting info on present strains organisms = oldsampNames(2:height(oldsampNames), 2); organisms = table2cell(organisms); % extracted names of models + +parfor i = 1:length(organisms) % find the unique set of all the reactions contained in the models + model =readCbModel([modPath filesep organisms{i,1} '.mat']); + models{i, 1} = model; +end + +exMets = {}; +for i = 1:length(organisms) % find the unique set of all the reactions contained in the models + model = models{i, 1}; + findmets = model.mets(find(contains(model.mets,'[e]'))); + exMets = union(exMets,findmets); +end + end diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/getMappingInfo.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/getMappingInfo.m index 40ec164c02..60052bed07 100755 --- a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/getMappingInfo.m +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/getMappingInfo.m @@ -31,7 +31,6 @@ % % .. Author: Federico Baldini 2017-2018 - reac = {}; % array with unique set of all the reactions present in the models exMets = {}; % array with unique set of all the extracellular metabolites present in the models diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/initMgPipe.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/initMgPipe.m index eb1fa8ddca..fbb6176b4c 100755 --- a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/initMgPipe.m +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/initMgPipe.m @@ -1,9 +1,10 @@ function [init, netSecretionFluxes, netUptakeFluxes, Y, modelStats, summary, statistics] = initMgPipe(modPath, abunFilePath, computeProfiles, varargin) -% This function is called from the MgPipe driver `StartMgPipe` takes care of saving some variables -% in the environment (in case that the function is called without a driver), does some checks on the -% inputs, and automatically launches MgPipe. As matter of fact, if all the inputs are properly inserted -% in the function it can replace the driver. - +% This function initializes the mgPipe pipeline and sets the optional input +% variables if not defined. +% +% USAGE +% [init, netSecretionFluxes, netUptakeFluxes, Y, modelStats, summary, statistics] = initMgPipe(modPath, abunFilePath, computeProfiles, varargin) +% % INPUTS: % modPath: char with path of directory where models are stored % abunFilePath: char with path and name of file from which to retrieve abundance information @@ -12,7 +13,10 @@ % % OPTIONAL INPUTS: % resPath: char with path of directory where results are saved -% dietFilePath: char with path of directory where the diet is saved +% dietFilePath: char with path of directory where the diet is saved. +% Can also be a character array with a separate diet for +% each individual, in that case, size(dietFilePath,1) +% needs to equal the length of samples. % infoFilePath: char with path to stratification criteria if available % hostPath: char with path to host model, e.g., Recon3D (default: empty) % hostBiomassRxn: char with name of biomass reaction in host (default: empty) @@ -55,6 +59,8 @@ % - Almut Heinken 02/2021: added option for creation of each % personalized model separately and % output of model stats +% - Almut Heinken 03/2021: inserted error message if +% abundances are not normalized. % Define default input parameters if not specified @@ -128,6 +134,13 @@ error('Path to file with dietary information is incorrect!'); end +% test if abundances are normalized +abundance = table2cell(readtable(abunFilePath,'ReadVariableNames',false)); +totalAbun=sum(str2double(abundance(2:end,2:end)),1); +if any(totalAbun > 1.01) + error('Abundances are not normalized. Please run the function normalizeCoverage!') +end + if strcmp(infoFilePath, '') patStat = false; else diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/mgPipe.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/mgPipe.m index ca6be02951..c88d0ef576 100755 --- a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/mgPipe.m +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/mgPipe.m @@ -1,5 +1,5 @@ function [netSecretionFluxes, netUptakeFluxes, Y, modelStats, summary, statistics] = mgPipe(modPath, abunFilePath, computeProfiles, resPath, dietFilePath, infoFilePath, hostPath, hostBiomassRxn, hostBiomassRxnFlux, objre, buildSetupAll, saveConstrModels, figForm, numWorkers, rDiet, pDiet, includeHumanMets, lowerBMBound, repeatSim, adaptMedium) -% MgPipe is a MATLAB based pipeline to integrate microbial abundances +% mgPipe is a MATLAB based pipeline to integrate microbial abundances % (coming from metagenomic data) with constraint based modeling, creating % individuals' personalized models. % The pipeline is divided in 3 parts: @@ -9,7 +9,7 @@ % integrating abundance data retrieved from metagenomics. For each organism, % reactions are coupled to the objective function. % [PART 3] Simulations under different diet regimes. -% MgPipe was created (and tested) for AGORA 1.0 please first download AGORA +% mgPipe was created (and tested) for AGORA 1.0 please first download AGORA % version 1.0 from https://www.vmh.life/#downloadview and place the mat files % into a folder. % @@ -28,7 +28,7 @@ % hostBiomassRxn: char with name of biomass reaction in host (default: empty) % hostBiomassRxnFlux: double with the desired flux through the host % biomass reaction (default: zero) -% objre: char with reaction name of objective function of microbeNames +% objre: char with reaction name of objective function % buildSetupAll: boolean indicating the strategy that should be used to % build personalized models: if true, build a global setup model % containing all organisms in at least model (default), false: create @@ -65,11 +65,6 @@ % - Almut Heinken, 01/21: added option for creation of each personalized model separately %% PIPELINE: [PART 1] -% The number of microbeNames, their names, the number of samples and their identifiers -% are automatically detected from the input file. - -[sampNames,microbeNames]=getIndividualSizeName(abunFilePath); -%% % If PART1 was already % computed: if the associated file is already present in the results folder its % execution is skipped else its execution starts @@ -93,20 +88,61 @@ end end - % Computing genetic information - [reac,exMets,micRea,binOrg,patOrg,reacPat,reacNumb,reacSet,reacTab,reacAbun,reacNumber]=getMappingInfo(modPath,microbeNames,abunFilePath); - writetable(cell2table(reacAbun,'VariableNames',['Reactions';sampNames]'),strcat(resPath,'reactions.csv')); + abundance = table2cell(readtable(abunFilePath)); - %Create tables and save all the created variables - reacTab=[array2table(reac),array2table(reacTab,'VariableNames',sampNames')],[resPath 'compfile' filesep 'ReacTab.csv']; - reacSet=cell2table(reacSet,'VariableNames',sampNames'); - reacPat=[array2table(microbeNames),array2table(reacPat,'VariableNames',sampNames')]; -end + % The number of microbeNames, their names, the number of samples and their identifiers + % are automatically detected from the input file. + [sampNames,microbeNames,exMets]=getIndividualSizeName(abunFilePath,modPath); + + % remove rows of organisms that are not present in any sample + microbeNames(sum(cell2mat(abundance(:,2:end)),2)<0.0000001,:)=[]; + abundance(sum(cell2mat(abundance(:,2:end)),2)<0.0000001,:)=[]; + + % Extracellular spaces simulating the lumen are built and stored for + % each microbe. + [exch,modelStoragePath]=buildModelStorage(microbeNames,modPath); + + % Computing reaction presence + ReactionPresence=calculateReactionPresence(abunFilePath, modPath, {}); + writetable(cell2table(ReactionPresence),[resPath filesep 'ReactionPresence.csv'], 'WriteVariableNames', false); -% Plotting genetic information -[PCoA]=plotMappingInfo(resPath,patOrg,reacPat,reacTab,reacNumber,infoFilePath,figForm,sampNames,microbeNames); + % Computing reaction abundance + ReactionAbundance = fastCalculateReactionAbundance(abunFilePath, modPath, {}, numWorkers); + writetable(cell2table(ReactionAbundance'),[resPath filesep 'ReactionAbundance.csv'], 'WriteVariableNames', false); + + % Computing subsystem abundance + subsystemAbundance = calculateSubsystemAbundance([resPath filesep 'ReactionAbundance.csv']); + writetable(cell2table(subsystemAbundance),[resPath filesep 'SubsystemAbundance.csv'], 'WriteVariableNames', false); -save([resPath filesep 'mapInfo.mat'],'binOrg', 'mapP', 'exMets', 'micRea', 'patOrg', 'PCoA', 'reac', 'reacAbun', 'reacNumb', 'reacNumber', 'reacPat', 'reacSet', 'reacTab', 'sampNames', 'microbeNames') + % plot subsystem abundance + data=cell2mat(subsystemAbundance(2:end,2:end)); + xlabels=subsystemAbundance(1,2:end); + ylabels=subsystemAbundance(2:end,1); + figure; + imagesc(data) + colormap('hot') + colorbar + if length(xlabels)<30 + set(gca,'xtick',1:length(xlabels)); + xticklabels(xlabels); + xtickangle(90) + end + set(gca,'ytick',1:length(ylabels)); + yticklabels(ylabels); + ax=gca; + + if length(ylabels)<50 + ax.YAxis.FontSize = 8; + else + ax.YAxis.FontSize = 6; + end + set(gca,'TickLabelInterpreter', 'none'); + title('Relative reaction abundances summarized by subsystem') + print(strcat(resPath, 'Subsystem_abundances'), figForm) + + % save mapping info + save([resPath filesep 'mapInfo.mat'], 'mapP', 'exMets', 'exch', 'sampNames', 'microbeNames', 'modelStoragePath','abundance','-v7.3') +end %end of trigger for Autoload %% PIPELINE: [PART 2.1] @@ -158,16 +194,19 @@ % are also added. Models that are already existent will not be recreated, and % new microbiota models will be saved in the results folder. -abundance = table2cell(readtable(abunFilePath)); - -% remove rows of organisms that are not present in any sample -abundance(sum(cell2mat(abundance(:,2:end)),2)<0.0000001,:)=[]; +% set parallel pool +if numWorkers > 1 + poolobj = gcp('nocreate'); + if isempty(poolobj) + parpool(numWorkers) + end +end % if there is 500 reconstruction total or less, use fast setup creator to % carve each personalized model from one large setup model. if buildSetupAll if modbuild == 1 - setup=fastSetupCreator(modPath, microbeNames, host, objre, numWorkers); + setup=fastSetupCreator(exch, modelStoragePath, microbeNames, host, objre, buildSetupAll); setup.name='Global reconstruction with lumen / fecal compartments no host'; setup.recon=0; if ~isempty(host) @@ -178,7 +217,11 @@ end if modbuild==0 - load(strcat(resPath,'Setup_allbacs.mat')) + if ~isempty(host) + load(strcat(resPath,'Setup_host_allbacs.mat')) + else + load(strcat(resPath,'Setup_allbacs.mat')) + end end [createdModels]=createPersonalizedModel(abundance,resPath,setup,sampNames,microbeNames,host,hostBiomassRxn); @@ -188,28 +231,31 @@ % define what counts as zero abundance tol=0.0000001; - for i=1:length(sampNames) - % Here, we will not be starting from one joined model containing all - % reconstructions. Instead, each personalized model will be created separately.) - % get the list of models for each sample and remove the ones not in - % this sample - - % retrieving current model ID - if ~isempty(host) - mId = ['host_microbiota_model_samp_', sampNames{i,1}, '.mat']; + clear('microbeNames','exMets','abundance') + + steps=50; + % proceed in batches for improved effiency + for j=1:steps:length(sampNames) + if length(sampNames)-j>=steps-1 + endPnt=steps-1; else - mId = ['microbiota_model_samp_', sampNames{i,1}, '.mat']; + endPnt=length(sampNames)-j; end - % if the model doesn't exist yet - mapP = detectOutput(resPath, mId); - if isempty(mapP) - microbeNamesSample = microbeNames; - abunRed=abundance(:,i+1); - abunRed=[abundance(:,1),abunRed]; + parfor i=j:j+endPnt + % Here, we will not be starting from one joined model containing all + % reconstructions. Instead, each personalized model will be created separately.) + % get the list of models for each sample and remove the ones not in + % this sample + mappingData=load([resPath filesep 'mapInfo.mat']) + microbeNamesSample = mappingData.microbeNames; + abunRed=mappingData.abundance(:,i+1); + abunRed=[mappingData.abundance(:,1),abunRed]; microbeNamesSample(cell2mat(abunRed(:,2)) < tol,:)=[]; abunRed(cell2mat(abunRed(:,2)) < tol,:)=[]; - setupModel = fastSetupCreator(modPath, microbeNamesSample, host, objre, numWorkers); + setupModel = fastSetupCreator(exch, modelStoragePath, microbeNamesSample, host, objre, buildSetupAll); + + % create personalized models for the batch createdModel=createPersonalizedModel(abunRed,resPath,setupModel,sampNames(i,1),microbeNamesSample,host,hostBiomassRxn); end end @@ -222,13 +268,16 @@ % analysis for all the exchange reactions of the diet and fecal compartment is % also computed and saved in a file called "simRes". -[exchanges, netProduction, netUptake, presol, inFesMat] = microbiotaModelSimulator(resPath, exMets, sampNames, dietFilePath, hostPath, hostBiomassRxn, hostBiomassRxnFlux, numWorkers, rDiet, pDiet, saveConstrModels, computeProfiles, includeHumanMets, lowerBMBound, repeatSim, adaptMedium); -% Finally, NMPCs (net maximal production capability) are computed in a metabolite -% resolved manner and saved in a comma delimited file in the results folder. NMPCs -% indicate the maximal production of each metabolite and are computing summing -% the maximal secretion flux with the maximal uptake flux. Similarity of metabolic -% profiles (using the different NMPCs as features) between individuals are also -% evaluated with classical multidimensional scaling. +load([resPath filesep 'mapInfo.mat']) +if computeProfiles || saveConstrModels + [exchanges, netProduction, netUptake, presol, inFesMat] = microbiotaModelSimulator(resPath, exMets, sampNames, dietFilePath, hostPath, hostBiomassRxn, hostBiomassRxnFlux, numWorkers, rDiet, pDiet, saveConstrModels, computeProfiles, includeHumanMets, lowerBMBound, repeatSim, adaptMedium); + % Finally, NMPCs (net maximal production capability) are computed in a metabolite + % resolved manner and saved in a comma delimited file in the results folder. NMPCs + % indicate the maximal production of each metabolite and are computing summing + % the maximal secretion flux with the maximal uptake flux. Similarity of metabolic + % profiles (using the different NMPCs as features) between individuals are also + % evaluated with classical multidimensional scaling. +end if computeProfiles [netSecretionFluxes, netUptakeFluxes, Y] = mgSimResCollect(resPath, sampNames, exchanges, rDiet, pDiet, infoFilePath, netProduction, netUptake, figForm); @@ -237,6 +286,7 @@ netUptakeFluxes={}; Y=[]; delete('simRes.mat','intRes.mat') + rmdir([resPath filesep 'modelStorage'],'s') end % get stats on microbiome models-number of reactions and metabolites diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/microbiotaModelSimulator.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/microbiotaModelSimulator.m index d7bdb1ca09..676a5d6132 100755 --- a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/microbiotaModelSimulator.m +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/microbiotaModelSimulator.m @@ -50,6 +50,13 @@ % .. Author: Federico Baldini, 2017-2018 % Almut Heinken, 03/2021: simplified inputs +% set a solver if not done yet +global CBT_LP_SOLVER +solver = CBT_LP_SOLVER; +if isempty(solver) + initCobraToolbox(false); %Don't update the toolbox automatically +end + for i=1:length(exMets) exchanges{i,1} = ['EX_' exMets{i}]; end @@ -219,12 +226,6 @@ end end - % set a solver if not done yet - global CBT_LP_SOLVER - solver = CBT_LP_SOLVER; - if isempty(solver) - initCobraToolbox(false); %Don't update the toolbox automatically - end solution_allOpen = solveCobraLP(buildLPproblemFromModel(model)); % solution_allOpen=solveCobraLPCPLEX(model,2,0,0,[],0); if solution_allOpen.stat==0 diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/normalizeCoverage.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/normalizeCoverage.m index 738a86b684..a595266b9b 100755 --- a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/normalizeCoverage.m +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/normalizeCoverage.m @@ -31,6 +31,17 @@ coverage = table2cell(readtable(abunFilePath,'ReadVariableNames',false)); coverage{1,1}='ID'; + +% delete samples that are all zeros (if applies) +totalAbun=sum(str2double(coverage(2:end,2:end)),1); +allzero=find(totalAbun<0.0000001); +if ~isempty(allzero) + for i=1:length(allzero) + allzero(i)=allzero(i)+1; + end + coverage(:,allzero)=[]; +end + abundanceNew = coverage; for i=2:size(coverage,2) diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/plotMappingInfo.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/plotMappingInfo.m index 84aa5b8f32..0c46a12561 100755 --- a/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/plotMappingInfo.m +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/mgPipe/plotMappingInfo.m @@ -185,7 +185,7 @@ % hold on % end - print(strcat(resPath, 'PCoA reactions'), figForm) + print(strcat(resPath, 'PCoA_reactions'), figForm) else disp('noPcoA will be plotted') end diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/pairwiseInteractionModeling/createMultipleSpeciesModel.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/pairwiseInteractionModeling/createMultipleSpeciesModel.m index 474466e5b3..70aa2e5b99 100644 --- a/src/analysis/multiSpecies/microbiomeModelingToolbox/pairwiseInteractionModeling/createMultipleSpeciesModel.m +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/pairwiseInteractionModeling/createMultipleSpeciesModel.m @@ -162,7 +162,7 @@ for i = 1:modelNumber model=models{i, 1}; biomassReaction=model.rxns(find(strncmp(model.rxns, 'bio', 3))); - model = removeBalancedCycles(model, biomassReaction, database,unionRxns); + model = removeFutileCycles(model, biomassReaction, database,unionRxns); models{i, 1} = model; end %% define some variables diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/pairwiseInteractionModeling/joinModelsPairwiseFromList.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/pairwiseInteractionModeling/joinModelsPairwiseFromList.m index 4918a7cd0a..8e743d2ace 100755 --- a/src/analysis/multiSpecies/microbiomeModelingToolbox/pairwiseInteractionModeling/joinModelsPairwiseFromList.m +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/pairwiseInteractionModeling/joinModelsPairwiseFromList.m @@ -56,6 +56,22 @@ function joinModelsPairwiseFromList(modelList, modelFolder, varargin) mergeGenesFlag = parser.Results.mergeGenesFlag; pairwiseModelFolder = parser.Results.pairwiseModelFolder; +% initialize COBRA Toolbox and parallel pool +global CBT_LP_SOLVER +if isempty(CBT_LP_SOLVER) + initCobraToolbox +end +solver = CBT_LP_SOLVER; + +if numWorkers>0 && ~isempty(ver('parallel')) + % with parallelization + poolobj = gcp('nocreate'); + if isempty(poolobj) + parpool(numWorkers) + end +end +environment = getEnvironment(); + pairedModelInfo = {}; cnt = 1; @@ -67,104 +83,66 @@ function joinModelsPairwiseFromList(modelList, modelFolder, varargin) existingModels(find(strcmp(existingModels(:,1),'..')),:)=[]; % then join all models in modelList +inputModels={}; for i = 1:size(modelList, 1) - if numWorkers > 0 - % with parallelization - poolobj = gcp('nocreate'); - if isempty(poolobj) - parpool(numWorkers) - end - pairedModelsTemp = {}; - - % Load the reconstructions to be joined - inputModels={}; - model=readCbModel([modelFolder filesep modelList{i,1} '.mat']); - inputModels{i}=model; - for k = i + 1:size(modelList, 1) - model=readCbModel([modelFolder filesep modelList{k,1} '.mat']); - inputModels{k}=model; - end + + % Load the reconstructions to be joined + model=readCbModel([modelFolder filesep modelList{i,1} '.mat']); + inputModels{i}=model; + for k = i + 1:size(modelList, 1) + model=readCbModel([modelFolder filesep modelList{k,1} '.mat']); + inputModels{k}=model; + end + + pairedModelsTemp = {}; + + parfor k = i + 1:size(modelList, 1) + restoreEnvironment(environment); + changeCobraSolver(solver, 'LP', 0, -1); - parfor k = i + 1:size(modelList, 1) - model1 = inputModels{i}; - model2 = inputModels{k}; - models = { - model1 - model2 - }; - nameTagsModels = { - strcat(modelList{i}, '_') - strcat(modelList{k}, '_') - }; - if ~contains(existingModels,['pairedModel', '_', modelList{i}, '_', modelList{k}, '.mat']) - [pairedModel] = createMultipleSpeciesModel(models, 'nameTagsModels', nameTagsModels,'mergeGenesFlag', mergeGenesFlag); - [pairedModel] = coupleRxnList2Rxn(pairedModel, pairedModel.rxns(strmatch(nameTagsModels{1, 1}, pairedModel.rxns)), strcat(nameTagsModels{1, 1}, model1.rxns(find(strncmp(model1.rxns, 'bio', 3)))), c, u); - [pairedModel] = coupleRxnList2Rxn(pairedModel, pairedModel.rxns(strmatch(nameTagsModels{2, 1}, pairedModel.rxns)), strcat(nameTagsModels{2, 1}, model2.rxns(find(strncmp(model2.rxns, 'bio', 3)))), c, u); - pairedModelsTemp{k} = pairedModel; - else - pairedModelsTemp{k} = {}; - end + model1 = inputModels{i}; + model2 = inputModels{k}; + models = { + model1 + model2 + }; + nameTagsModels = { + strcat(modelList{i}, '_') + strcat(modelList{k}, '_') + }; + if isempty(contains(existingModels,['pairedModel', '_', modelList{i}, '_', modelList{k}, '.mat'])) + [pairedModel] = createMultipleSpeciesModel(models, 'nameTagsModels', nameTagsModels,'mergeGenesFlag', mergeGenesFlag); + [pairedModel] = coupleRxnList2Rxn(pairedModel, pairedModel.rxns(strmatch(nameTagsModels{1, 1}, pairedModel.rxns)), strcat(nameTagsModels{1, 1}, model1.rxns(find(strncmp(model1.rxns, 'bio', 3)))), c, u); + [pairedModel] = coupleRxnList2Rxn(pairedModel, pairedModel.rxns(strmatch(nameTagsModels{2, 1}, pairedModel.rxns)), strcat(nameTagsModels{2, 1}, model2.rxns(find(strncmp(model2.rxns, 'bio', 3)))), c, u); + pairedModelsTemp{k} = pairedModel; + else + pairedModelsTemp{k} = {}; end - for k = i + 1:size(modelList, 1) - % keep track of the generated models and populate the output file with - % information on joined models - model1 = inputModels{i}; - pairedModelInfo{cnt, 1} = ['pairedModel', '_', modelList{i}, '_', modelList{k}, '.mat']; - pairedModelInfo{cnt, 2} = modelList{i}; - pairedModelInfo{cnt, 3} = model1.rxns(find(strncmp(model1.rxns, 'bio', 3))); - model2 = inputModels{k}; - pairedModelInfo{cnt, 4} = modelList{k}; - pairedModelInfo{cnt, 5} = model2.rxns(find(strncmp(model2.rxns, 'bio', 3))); - % save file regularly - if floor(cnt/1000) == cnt/1000 + end + + for k = i + 1:size(modelList, 1) + % keep track of the generated models and populate the output file with + % information on joined models + model1 = inputModels{i}; + pairedModelInfo{cnt, 1} = ['pairedModel', '_', modelList{i}, '_', modelList{k}, '.mat']; + pairedModelInfo{cnt, 2} = modelList{i}; + pairedModelInfo{cnt, 3} = model1.rxns(find(strncmp(model1.rxns, 'bio', 3))); + model2 = inputModels{k}; + pairedModelInfo{cnt, 4} = modelList{k}; + pairedModelInfo{cnt, 5} = model2.rxns(find(strncmp(model2.rxns, 'bio', 3))); + % save file regularly + if floor(cnt/1000) == cnt/1000 save([pairwiseModelFolder filesep 'pairedModelInfo'],'pairedModelInfo'); - end - - if ~contains(existingModels,['pairedModel', '_', modelList{i}, '_', modelList{k}, '.mat']) - pairedModel=pairedModelsTemp{k}; - save([pairwiseModelFolder filesep pairedModelInfo{cnt,1}],'pairedModel'); - end - cnt = cnt + 1; end - else - % without parallelization - for j = i + 1:size(modelList, 1) - model1 = inputModels{i}; - model2 = inputModels{j}; - models = { - model1 - model2 - }; - nameTagsModels = { - strcat(modelList{i}, '_') - strcat(modelList{j}, '_') - }; - % keep track of the generated models and populate the output file with - % information on joined models - pairedModelInfo{cnt, 1} = ['pairedModel', '_', modelList{i}, '_', modelList{j}, '.mat']; - pairedModelInfo{cnt, 2} = modelList{i}; - pairedModelInfo{cnt, 3} = model1.rxns(find(strncmp(model1.rxns, 'bio', 3))); - pairedModelInfo{cnt, 4} = modelList{j}; - pairedModelInfo{cnt, 5} = model2.rxns(find(strncmp(model2.rxns, 'bio', 3))); - % save file regularly - if floor(cnt/1000) == cnt/1000 - save([pairwiseModelFolder filesep 'pairedModelInfo'],'pairedModelInfo'); - end - - if ~contains(existingModels,['pairedModel', '_', modelList{i}, '_', modelList{j}, '.mat']) - [pairedModel] = createMultipleSpeciesModel(models, 'nameTagsModels', nameTagsModels,'mergeGenesFlag', mergeGenesFlag); - [pairedModel] = coupleRxnList2Rxn(pairedModel, pairedModel.rxns(strmatch(nameTagsModels{1, 1}, pairedModel.rxns)), strcat(nameTagsModels{1, 1}, model1.rxns(find(strncmp(model1.rxns, 'bio', 3)))), c, u); - [pairedModel] = coupleRxnList2Rxn(pairedModel, pairedModel.rxns(strmatch(nameTagsModels{2, 1}, pairedModel.rxns)), strcat(nameTagsModels{2, 1}, model2.rxns(find(strncmp(model2.rxns, 'bio', 3)))), c, u); - pairedModels{cnt, 1} = pairedModel; - save([pairwiseModelFolder filesep pairedModelInfo{cnt,1}],'pairedModel'); - else - pairedModels{cnt, 1} = {}; - end - cnt = cnt + 1; + + if isempty(contains(existingModels,['pairedModel', '_', modelList{i}, '_', modelList{k}, '.mat'])) + pairedModel=pairedModelsTemp{k}; + save([pairwiseModelFolder filesep pairedModelInfo{cnt,1}],'pairedModel'); end + cnt = cnt + 1; end end save([pairwiseModelFolder filesep 'pairedModelInfo'],'pairedModelInfo'); -end +end \ No newline at end of file diff --git a/src/analysis/multiSpecies/microbiomeModelingToolbox/pairwiseInteractionModeling/simulatePairwiseInteractions.m b/src/analysis/multiSpecies/microbiomeModelingToolbox/pairwiseInteractionModeling/simulatePairwiseInteractions.m index cad80b01ae..afede59287 100755 --- a/src/analysis/multiSpecies/microbiomeModelingToolbox/pairwiseInteractionModeling/simulatePairwiseInteractions.m +++ b/src/analysis/multiSpecies/microbiomeModelingToolbox/pairwiseInteractionModeling/simulatePairwiseInteractions.m @@ -118,20 +118,20 @@ solutionsTmp={}; if isfile([pairwiseModelFolder filesep 'pairedModelInfo.mat']) -load([pairwiseModelFolder filesep 'pairedModelInfo.mat']) +load([pairwiseModelFolder filesep 'pairedModelInfo.mat'],'pairedModelInfo') else error('Pairwise models have not been created. Please run joinModelsPairwiseFromList first.') end -parfor i = 1:size(pairedModelInfo, 1) +info=pairedModelInfo; +parfor i = 1:size(info, 1) restoreEnvironment(environment); changeCobraSolver(solver, 'LP', 0, -1); changeCobraSolverParams('LP', 'logFile', 0); % load the model - fileToLoad=load([pairwiseModelFolder filesep pairedModelInfo{i,1}]); - toLoad=fieldnames(fileToLoad); - pairedModel=fileToLoad.(toLoad{1}); + filename=[pairwiseModelFolder filesep info{i,1}]; + pairedModel=readCbModel(filename); pairedModelOrg=pairedModel; % if a diet was input if ~isempty(inputDiet) @@ -139,8 +139,8 @@ end % for each paired model, set both biomass objective functions as % objectives - biomass1 = strcat(pairedModelInfo{i, 2}, '_', pairedModelInfo{i, 3}); - biomass2 = strcat(pairedModelInfo{i, 4}, '_', pairedModelInfo{i, 5}); + biomass1 = strcat(info{i, 2}, '_', info{i, 3}); + biomass2 = strcat(info{i, 4}, '_', info{i, 5}); model1biomass = find(ismember(pairedModel.rxns, biomass1)); pairedModel.c(model1biomass, 1) = 1; model2biomass = find(ismember(pairedModel.rxns, biomass2)); @@ -160,7 +160,7 @@ end pairedModel = changeObjective(pairedModel, biomass1); % disable flux through the second model - pairedModel = changeRxnBounds(pairedModel, pairedModel.rxns(strmatch(strcat(pairedModelInfo{i, 4}, '_'), pairedModel.rxns)), 0, 'b'); + pairedModel = changeRxnBounds(pairedModel, pairedModel.rxns(strmatch(strcat(info{i, 4}, '_'), pairedModel.rxns)), 0, 'b'); % calculate single biomass solutionSingle1 = solveCobraLP(buildLPproblemFromModel(pairedModel,false)); % silence model 1 and optimize model 2 @@ -172,7 +172,7 @@ end pairedModel = changeObjective(pairedModel, biomass2); % disable flux through the first model - pairedModel = changeRxnBounds(pairedModel, pairedModel.rxns(strmatch(strcat(pairedModelInfo{i, 2}, '_'), pairedModel.rxns)), 0, 'b'); + pairedModel = changeRxnBounds(pairedModel, pairedModel.rxns(strmatch(strcat(info{i, 2}, '_'), pairedModel.rxns)), 0, 'b'); % calculate single biomass solutionSingle2 = solveCobraLP(buildLPproblemFromModel(pairedModel,false)); diff --git a/src/analysis/thermo/componentContribution/old/invertProjection.m b/src/analysis/thermo/componentContribution/old/invertProjection.m index 335354a566..3cf4dce6b4 100644 --- a/src/analysis/thermo/componentContribution/old/invertProjection.m +++ b/src/analysis/thermo/componentContribution/old/invertProjection.m @@ -1,43 +1,27 @@ function [inv_A, r, P_R, P_N] = invertProjection(A, epsilon) -% Inverts a general matrix A using the pseudoinverse -% -% USAGE: -% -% [inv_A, r, P_R, P_N] = invertProjection(A, epsilon) -% INPUTS: -% A: general matrix -% epsilon: default = 1e-10 -% -% OUTPUTS: -% inv_A: the pseudoinverse of `A` -% r: the rank of `A` -% P_R: the projection matrix onto the `range(A)` -% P_N: the projection matrix onto the `null(A')` +% invert a general matrix A using the pseudoinverse +% return values are: +% inv_A - the pseudoinverse of A +% r - the rank of A +% P_R - the projection matrix onto the range(A) +% P_N - the projection matrix onto the null(A') if nargin < 2 epsilon = 1e-10; end [m, n] = size(A); -if 1 - %[U, S, V] = svd(A); % not working, uncommented line below - Lemmer - %[U, S, V] = svds(A,min(size(A))); % Bugfix due to svd convergence problems - [U, S, V] = svd(full(A),'econ'); %from Michael Saunders code - r = sum(sum(abs(S) > epsilon)); - inv_S = diag(1 ./ S(abs(S) > epsilon)); - inv_A = V(:, 1:r) * inv_S(1:r, 1:r) * U(:, 1:r)'; - P_R = U(:, 1:r) * U(:, 1:r)'; - P_N = U(:, (r+1):end) * U(:, (r+1):end)'; - -else - %Michael Saunders code -TODO integrate this properly - [U1,D1,V1,r] = subspaceSVD(A); - P_R=U1*U1';%projection matrix onto the range(A) - P_N=eye(m) - U1*U1';%projection matrix onto null(A') - inv_A=pinv(A,1e-12); -end - % Michael Saunders code - % [U1,D1,V1,r] = subspaceSVD(A); - % P_R=U1*U1';%projection matrix onto the range(A) - % P_N=eye(m) - U1*U1';%projection matrix onto the null(A') - % inv_A=pinv(A,1e-12); \ No newline at end of file +%[U, S, V] = svd(A); % not working, uncommented line below - Lemmer +[U, S, V] = svds(A,min(size(A))); % Bugfix due to svd convergence problems +%[U, S, V] = svd(full(A),'econ'); %from Michael Saunders code +r = sum(sum(abs(S) > epsilon)); +inv_S = diag(1 ./ S(abs(S) > epsilon)); +inv_A = V(:, 1:r) * inv_S(1:r, 1:r) * U(:, 1:r)'; +P_R = U(:, 1:r) * U(:, 1:r)'; +P_N = U(:, (r+1):end) * U(:, (r+1):end)'; + +% Michael Saunders code +% [U1,D1,V1,r] = subspaceSVD(A); +% P_R=U1*U1';%projection matrix onto the range(A) +% P_N=eye(m) - U1*U1';%projection matrix onto the null(A') +% inv_A=pinv(A,1e-12); diff --git a/src/analysis/thermo/componentContribution/setupComponentContribution.m b/src/analysis/thermo/componentContribution/setupComponentContribution.m new file mode 100644 index 0000000000..60f500c6a1 --- /dev/null +++ b/src/analysis/thermo/componentContribution/setupComponentContribution.m @@ -0,0 +1,132 @@ +function model = setupComponentContribution(model,molFileDir,cid,printLevel) +% Estimates standard transformed reaction Gibbs energy and directionality +% at in vivo conditions in multicompartmental metabolic reconstructions. +% Has external dependencies on the COBRA toolbox, the component +% contribution method, Python (with numpy and Open Babel bindings), +% ChemAxon's Calculator Plugins, and Open Babel. See details on +% availability at the end of help text. +% +% modelT = setupThermoModel(model,molfileDir,cid,T,cellCompartments,ph,... +% is,chi,concMin,concMax,confidenceLevel) +% +% INPUTS +% model Model structure with following fields: +% .S m x n stoichiometric matrix. +% .mets m x 1 array of metabolite identifiers. +% .rxns n x 1 array of reaction identifiers. +% .metFormulas m x 1 cell array of metabolite formulas. Formulas for +% protons should be H, and formulas for water should be +% H2O. +% .metCharges m x 1 numerical array of metabolite charges. +% +% OPTIONAL INPUTS +% molFileDir Path to a directory containing molfiles for the +% major tautomer of the major microspecies of +% each metabolite at pH 7. Molfiles should be +% named with the metabolite identifiers in +% model.mets (without compartment assignments). +% Not required if cid are specified. +% cid m x 1 cell array of KEGG Compound identifiers. +% Not required if molfiledir is specified. +% model.metCompartments m x 1 array of metabolite compartment +% assignments. Not required if metabolite +% identifiers are strings of the format ID[*] +% where * is the appropriate compartment +% identifier. +% +% OUTPUTS +% model Model structure with following additional fields: +% .inchi Structure containing four m x 1 cell array's of +% IUPAC InChI strings for metabolites, with varying +% levels of structural detail. +% .pKa m x 1 structure containing metabolite pKa values +% estimated with ChemAxon's Calculator Plugins. +% .pseudoisomers p x 4 matrix with the following columns: +% 1. Metabolite index. +% 2. Estimated pseudoisomer standard Gibbs energy. +% 3. Number of hydrogen atoms in pseudoisomer +% chemical formula. +% 4. Charge on pseudoisomer. +% +% WRITTEN OUTPUTS +% MetStructures.sdf An SDF containing all structures input to the +% component contribution method for estimation of +% standard Gibbs energies. +% +% +% Ronan M. T. Fleming, Sept. 2012 Version 1.0 +% Hulda S. H., Dec. 2012 Version 2.0 + + +%% Configure inputs +% Retreive molfiles from KEGG if KEGG ID are given. Otherwise use molfiles +% in molfileDir. +if ~exist('cid','var') + cid = []; +end +if ~exist('printLevel','var') + printLevel = 1; +end +%% Get metabolite structures +if ~isempty(cid) + molFileDir = 'molfilesFromKegg'; + fprintf('\nRetreiving molfiles from KEGG.\n'); + takeMajorMS = true; % Convert molfile from KEGG to major tautomer of major microspecies at pH 7 + pH = 7; + takeMajorTaut = true; + kegg2mol(cid,molFileDir,model.mets,takeMajorMS,pH,takeMajorTaut); % Retreive mol files +end + +if printLevel>0 +fprintf('Creating MetStructures.sdf from molfiles.\n') +end + +sdfFileName = 'MetStructures.sdf'; +includeRs = 0; % Do not include structures with R groups in SDF +[sdfMetList,noMolMetList] = mol2sdf(model.mets,molFileDir,sdfFileName,includeRs); + +if printLevel>0 +fprintf('Converting SDF to InChI strings.\n') +end +model.inchi = createInChIStruct(model.mets,sdfFileName); +compositeBool = ~cellfun('isempty',regexp(model.inchi.nonstandard,'\.')); % Remove InChI for composite compounds as they cause problems later. +model.inchi.standard(compositeBool) = cell(sum(compositeBool),1); +model.inchi.standardWithStereo(compositeBool) = cell(sum(compositeBool),1); +model.inchi.standardWithStereoAndCharge(compositeBool) = cell(sum(compositeBool),1); +model.inchi.nonstandard(compositeBool) = cell(sum(compositeBool),1); + +%% Estimate metabolite pKa values with ChemAxon calculator plugins and determine all relevant pseudoisomers. +if printLevel>0 +fprintf('Estimating metabolite pKa values.\n'); +end +npKas = 20; % Number of acidic and basic pKa values to estimate +takeMajorTaut = false; % Estimate pKa for input tautomer. Input tautomer is assumed to be the major tautomer for the major microspecies at pH 7. +model.pseudoisomers = estimate_pKa(model.mets,model.inchi.nonstandard,npKas,takeMajorTaut); % Estimate pKa and determine pseudoisomers +model.pseudoisomers = rmfield(model.pseudoisomers,'met'); + +% Add number of hydrogens and charge for metabolites with no InChI +if any(~[model.pseudoisomers.success]) && printLevel>0 + fprintf('Assuming that metabolite species in model.metFormulas are representative for metabolites where pKa could not be estimated.\n'); +end +nonphysicalMetSpecies = {}; +for i = 1:length(model.mets) + model_z = model.metCharges(i); % Get charge from model + model_nH = numAtomsOfElementInFormula(model.metFormulas{i},'H'); % Get number of hydrogens from metabolite formula in model + if ~model.pseudoisomers(i).success + model.pseudoisomers(i).zs = model_z; + model.pseudoisomers(i).nHs = model_nH; + model.pseudoisomers(i).majorMSpH7 = true; % Assume species in model is the major (and only) metabolite species %RF: this seems dubious + end + if ~any(model.pseudoisomers(i).nHs == model_nH) + nonphysicalMetSpecies = [nonphysicalMetSpecies; model.mets(i)]; + end +end +if ~isempty(nonphysicalMetSpecies) + nonphysicalMetSpecies = unique(regexprep(nonphysicalMetSpecies,'\[\w\]','')); + if printLevel>1 + fprintf('%s\n','#H in model.metFormulas does not match any of the species calculated mol file for metabolites:') + for n=1:length(nonphysicalMetSpecies) + fprintf('%s\n',nonphysicalMetSpecies{n});%,model.metFormulas{m}); + end + end +end diff --git a/src/analysis/thermo/groupContribution/old/createGroupIncidenceMatrix_old.m b/src/analysis/thermo/groupContribution/jankowski/createGroupIncidenceMatrix_jankowski.m similarity index 100% rename from src/analysis/thermo/groupContribution/old/createGroupIncidenceMatrix_old.m rename to src/analysis/thermo/groupContribution/jankowski/createGroupIncidenceMatrix_jankowski.m diff --git a/src/analysis/thermo/groupContribution/new/createGroupIncidenceMatrix.m b/src/analysis/thermo/groupContribution/new/createGroupIncidenceMatrix.m index 665bf5bc1f..7e116e5638 100644 --- a/src/analysis/thermo/groupContribution/new/createGroupIncidenceMatrix.m +++ b/src/analysis/thermo/groupContribution/new/createGroupIncidenceMatrix.m @@ -1,4 +1,6 @@ -function combinedModel = createGroupIncidenceMatrix(model, trainingModel, param) +function trainingModel = createGroupIncidenceMatrix(model, trainingModel, mappingScore, printLevel) +% Initialize `G` matrix, and then use the python script "inchi2gv.py" to decompose each of the +% compounds that has an 'InChI' and save the decomposition as a row in the `G` matrix. % % USAGE: % @@ -13,306 +15,19 @@ % trainingModel.S: p x n stoichiometric matrix of training data % trainingModel.metKEGGID: p x 1 cell array of metabolite KEGGID % trainingModel.inchi.nonstandard: p x 1 cell array of nonstandard InChI -% trainingModel.test2CombinedModelMap: m x 1 mapping of model.mets to training data metabolites -% trainingModel.mappingScore +% trainingModel.Model2TrainingMap: m x 1 mapping of model.mets to training data metabolites +% trainingModel.mappingScore % % OUTPUT: -% combinedModel: -% combinedModel.S: k x n stoichiometric matrix of training padded with zero rows for metabolites exclusive to test data -% combinedModel.G: k x g group incicence matrix -% combinedModel.groups: g x 1 cell array of group definitions -% combinedModel.trainingMetBool k x 1 boolean indicating training metabolites in G -% combinedModel.testMetBool k x 1 boolean indicating test metabolites in G -% combinedModel.groupDecomposableBool: k x 1 boolean indicating metabolites with group decomposition -% combinedModel.inchiBool k x 1 boolean indicating metabolites with inchi - -% combinedModel.cids_that_dont_decompose: z x 1 ids of compounds that do not decomopose - -% - -if isempty(model) - model.inchi.nonstandard=[]; -end -if isempty(trainingModel) - trainingModel.inchi.nonstandard=[]; -end - -if ~exist('param','var') - param=struct(); -end -if ~isfield(param,'printLevel') - param.printLevel=0; -end - -%parameters for auto fragmentation -if ~isfield(param,'fragmentationMethod') - param.fragmentationMethod='abinito'; -end -if ~isfield(param,'modelCache') - param.modelCache=[]; -end -if ~isfield(param,'printLevel') - param.printLevel=0; -end -if ~isfield(param,'radius') - param.radius=1; -end -if ~isfield(param,'dGPredictorPath') - param.dGPredictorPath='/home/rfleming/work/sbgCloud/code/dGPredictor'; -end -if ~isfield(param,'canonicalise') - param.canonicalise=0; -end - -fprintf('Creating group incidence matrix\n'); - -switch param.fragmentationMethod - case 'abinito' - %function model = createFragmentIncidenceMatrix(inchi,radius,dGPredictorPath,canonicalise) - % model.G: k x g fragment incidence matrix - - - - %fragment each of the trainingModel inchi - if param.printLevel>0 - fprintf('%s\n','Ab inito fragmentation of each of the trainingModel inchi...') - end - trainingModelInchi=trainingModel.inchi.nonstandard; - [trainingModelFragmentedMol,trainingModelDecomposableBool,trainingModelInchiExistBool] = autoFragment(trainingModelInchi,param.radius,param.dGPredictorPath,param.canonicalise,'autoFragment_trainingModel',param.printLevel-1); - if param.printLevel>0 - fprintf('%s\n','...done.') - end - - %fragment each of the model inchi - if param.printLevel>0 - fprintf('%s\n','Ab inito fragmentation of each of the model inchi...') - end - modelInchi=model.inchi.nonstandard; - [modelFragmentedMol,modelDecomposableBool,modelInchiExistBool] = autoFragment(modelInchi,param.radius,param.dGPredictorPath,param.canonicalise,param.modelCache,param.printLevel-1); - if param.printLevel>0 - fprintf('%s\n','...done.') - end - - - - nTrainingModelMets=length(trainingModelFragmentedMol); - nModelMets=length(modelFragmentedMol); - - trainingFragmentSmiles = cell(0); - %collate the fragments in the training model - for i = 1:nTrainingModelMets - if trainingModelDecomposableBool(i) - if param.printLevel>1 - disp(trainingModelFragmentedMol(i).inchi) - trainingModelFragmentedMol(i).smilesCounts - end - if trainingModelInchiExistBool(i) - if isempty(trainingModelFragmentedMol(i).inchi) - trainingModelDecomposableBool(i)=0; - trainingModelInchiExistBool(i)=0; - else - trainingFragmentSmiles = [trainingFragmentSmiles;trainingModelFragmentedMol(i).smilesCounts.keys']; - end - else - trainingModelDecomposableBool(i)=0; - end - end - end - %set of unique fragments in the training model - uniqueTrainingFragmentSmiles = unique(trainingFragmentSmiles); - - testFragmentSmiles = cell(0); - %now the model mol - for i = 1:nModelMets - if modelDecomposableBool(i) - if param.printLevel>1 - disp(modelFragmentedMol(i).inchi) - modelFragmentedMol(i).smilesCounts - end - if modelInchiExistBool(i) - if isempty(modelFragmentedMol(i).inchi) - modelDecomposableBool(i)=0; - modelInchiExistBool(i)=0; - else - testFragmentSmiles = [testFragmentSmiles;modelFragmentedMol(i).smilesCounts.keys']; - end - else - modelDecomposableBool(i)=0; - end - end - end - %set of unique fragments in the test model - uniqueTestFragmentSmiles = unique(testFragmentSmiles); - - fragmentSmilesUniqueToTraining = setdiff(uniqueTrainingFragmentSmiles,uniqueTestFragmentSmiles); - if ~isempty(fragmentSmilesUniqueToTraining) - fprintf('%s\n',['There are ' int2str(length(fragmentSmilesUniqueToTraining)) ' fragments unique to the training model.']) - end - fragmentSmilesInCommon = intersect(uniqueTestFragmentSmiles,uniqueTrainingFragmentSmiles); - if ~isempty(fragmentSmilesInCommon) - fprintf('%s\n',['There are ' int2str(length(fragmentSmilesInCommon)) ' fragments in common between the training and test models.']) - end - fragmentSmilesUniqueToTest = setdiff(uniqueTestFragmentSmiles,uniqueTrainingFragmentSmiles); - if ~isempty(fragmentSmilesUniqueToTest) - fprintf('%s\n',['There are ' int2str(length(fragmentSmilesUniqueToTest)) ' fragments unique to the test model.']) - end - - %start based on training model - combinedModel = trainingModel; - - % combinedModel.groupDecomposableBool: k x 1 boolean indicating metabolites with group decomposition - combinedModel.groupDecomposableBool=[trainingModelDecomposableBool;modelDecomposableBool]; - combinedModel.inchiBool = [trainingModelInchiExistBool;modelInchiExistBool]; - % combinedModel.trainingMetBool k x 1 boolean indicating training metabolites in G - combinedModel.trainingMetBool = [true(nTrainingModelMets,1);false(nModelMets,1)]; - % combinedModel.testMetBool k x 1 boolean indicating test metabolites in G - combinedModel.testMetBool = [false(nTrainingModelMets,1);true(nModelMets,1)]; - - %fragments unique to the combined model - uniqueFragmentSmiles = unique([uniqueTrainingFragmentSmiles;uniqueTestFragmentSmiles]); - nFrag=length(uniqueFragmentSmiles); - nNonDecomposable=nnz(combinedModel.groupDecomposableBool==0); - - combinedModel.inchi.nonstandard = [trainingModelInchi;modelInchi]; - - nMets = nTrainingModelMets + nModelMets; - %preallocate the group incidence matrix - combinedModel.G = sparse(nMets,nFrag+nNonDecomposable); - - %use the keys to define the groups - combinedModel.groups = [uniqueFragmentSmiles;trainingModel.inchi.nonstandard(~trainingModelDecomposableBool);model.inchi.nonstandard(~modelDecomposableBool)]; - - %map each of the training model fragments to the consolidated list of fragments - d=1; - for i = 1:nTrainingModelMets - if trainingModelDecomposableBool(i) - bool = isKey(trainingModelFragmentedMol(i).smilesCounts,uniqueFragmentSmiles); - combinedModel.G(i,bool)=cell2mat(values(trainingModelFragmentedMol(i).smilesCounts)); - else - %non decomposable training molecule - combinedModel.G(i,nFrag+d) = 1; - d=d+1; - end - end - %map each of the test model fragments to the consolidated list of fragments - for i = 1:nModelMets - if modelDecomposableBool(i) - bool = isKey(modelFragmentedMol(i).smilesCounts,uniqueFragmentSmiles); - combinedModel.G(nTrainingModelMets+i,bool)=cell2mat(values(modelFragmentedMol(i).smilesCounts)); - else - if d>nNonDecomposable - error('inconsistent number of non-decomposable metabolites') - end - %non decomposable training molecule - combinedModel.G(nTrainingModelMets+i,nFrag+d) = 1; - d=d+1; - end - end - if d~=nNonDecomposable+1 - error('inconsistent number of non-decomposable metabolites') - end - - nExclusivelyTestMets = nnz(~combinedModel.trainingMetBool & combinedModel.testMetBool); - combinedModel.S = [trainingModel.S; sparse(nExclusivelyTestMets,size(trainingModel.S,2))]; % Add an empty row to S - combinedModel.mets=[trainingModel.mets;model.mets]; - combinedModel.rxns=trainingModel.rxns; - - uniqueMets = unique(combinedModel.mets); - if length(uniqueMets)~=length(combinedModel.mets) - error('combinedModel.mets is not a primary key') - end - if any(cellfun(@isempty,combinedModel.mets)) - error('combinedModel.mets is not a primary key') - end - - if 0 - save('debug_prior_to_regulariseGroupIncidenceMatrix') - - %analyse similar and duplicate metabolites - [groupM,inchiM] = regulariseGroupIncidenceMatrix(combinedModel,param.printLevel); - - %assume that metabolites with the same group decomposition are identical - test2CombinedModelM = groupM; - - %ignore duplicates within the training metabolite set - test2CombinedModelM(:,combinedModel.trainingMetBool)=0; - - %boolean identifier of duplicates - duplicatesBool = any(test2CombinedModelM,1); - - %add the test metabolites on the diagonal to preserve mapping to unique metabolites in test model - test2CombinedModelM = test2CombinedModelM + diag(combinedModel.testMetBool); - - %remove all duplicate metabolites from the arguments to the map - test2CombinedModelM = test2CombinedModelM(~duplicatesBool,:); - - combinedModel.test2CombinedModelMap=zeros(nModelMets,1); - for i=1:size(test2CombinedModelM,1) - if any(test2CombinedModelM(i,:)) - combinedModel.test2CombinedModelMap(test2CombinedModelM(i,combinedModel.testMetBool)~=0)=i; - end - end - if any(combinedModel.test2CombinedModelMap==0) - error('Mismatch in combinedModel.test2CombinedModelMap') - end - - % S: [6507×4149 double] - % rxns: {4149×1 cell} - % lb: [4149×1 double] - % cids: {672×1 cell} - % dG0_prime: [4149×1 double] - % T: [4149×1 double] - % I: [4149×1 double] - % pH: [4149×1 double] - % pMg: [4149×1 double] - % weights: [4149×1 double] - % balance: [4149×1 double] - % cids_that_dont_decompose: [43×1 double] - % mets: {6507×1 cell} - % metKEGGID: {672×1 cell} - % inchi: [1×1 struct] - % molBool: [672×1 logical] - % inchiBool: [6507×1 logical] - % compositeInchiBool: [672×1 logical] - % metFormulas: {672×1 cell} - % metCharges: [672×1 double] - % pseudoisomers: [672×1 struct] - % ub: [4149×1 double] - % dG0: [4149×1 double] - % groupDecomposableBool: [6507×1 logical] - % trainingMetBool: [6507×1 logical] - % testMetBool: [6507×1 logical] - % G: [6507×1536 double] - % groups: {1536×1 cell} - % test2CombinedModelMap: [5835×1 double] - - %save the original combined model - combinedModelOld = combinedModel; - clear combinedModel; - - %each row of the group incidence matrix should correspond to a unique metabolite - %account for any metabolites in the test dataset that are duplicated in the training dataset - combinedModel.S = combinedModelOld.S(~duplicatesBool,:); - combinedModel.mets = combinedModelOld.mets(~duplicatesBool); - combinedModel.inchi.nonstandard = combinedModelOld.inchi.nonstandard(~duplicatesBool); - combinedModel.inchiBool = combinedModelOld.inchiBool(~duplicatesBool); - combinedModel.groupDecomposableBool = combinedModelOld.groupDecomposableBool(~duplicatesBool); - combinedModel.trainingMetBool = combinedModelOld.trainingMetBool(~duplicatesBool); - combinedModel.testMetBool = combinedModelOld.testMetBool(~duplicatesBool); - combinedModel.rxns = combinedModelOld.rxns; - combinedModel.dG0 = combinedModelOld.dG0; - combinedModel.T = combinedModelOld.T; - combinedModel.pH = combinedModelOld.pH; - combinedModel.pMg = combinedModelOld.pMg; - combinedModel.I = combinedModelOld.I; - combinedModel.G = combinedModelOld.G(~duplicatesBool,:); - combinedModel.groups = combinedModelOld.groups; - combinedModel.test2CombinedModelMap = combinedModelOld.test2CombinedModelMap; - else - combinedModel.test2CombinedModelMap = (1:nModelMets)' + nTrainingModelMets; - end - case 'manual' - % +% trainingModel: +% trainingModel.S: k x n stoichiometric matrix of training + test data +% trainingModel.G: k x g group incicence matrix +% trainingModel.groups: g x 1 cell array of group definitions +% trainingModel.trainingMetBool k x 1 boolean indicating training metabolites in G +% trainingModel.testMetBool k x 1 boolean indicating test metabolites in G +% trainingModel.groupDecomposableBool: k x 1 boolean indicating metabolites with group decomposition +% trainingModel.cids_that_dont_decompose: z x 1 ids of compounds that do not decomopose +% % dG0: n x 1 standard Gibbs energy % dG0_prime: n x 1 standard transformed Gibbs energy @@ -336,142 +51,126 @@ % G: m x g group incicence matrix % groups: g x 1 cell array of group definitions % -% test2CombinedModelMap: mlt x 1 mapping of model.mets to training data metabolites +% Model2TrainingMap: mlt x 1 mapping of model.mets to training data metabolites +% + +if ~exist('printLevel','var') + printLevel=0; +end + - % Initialize `G` matrix, and then use the python script "inchi2gv.py" to decompose each of the - % compounds that has an 'InChI' and save the decomposition as a row in the `G` matrix. +fprintf('Creating group incidence matrix\n'); + +% first just run the script to get the list of group names +fullpath = which('getGroupVectorFromInchi.m'); +fullpath = regexprep(fullpath,'getGroupVectorFromInchi.m',''); + + +[status,result] = system('python2 --version'); +if status~=0 + % https://github.com/bdu91/group-contribution/blob/master/compound_groups.py + % Bin Du et al. Temperature-Dependent Estimation of Gibbs Energies Using an Updated Group-Contribution Method + [status,groupsTemp] = system(['python ' fullpath 'compound_groups.py -l']); + if status~=0 + error('createGroupIncidenceMatrix: call to compound_groups.py failed') + end +else + if 1 + inchi2gv = 'inchi2gv'; + else + inchi2gv = 'compound_groups'; + end + [status,groupsTemp] = system(['python2 ' fullpath inchi2gv '.py -l']);%seems to only work with python 2, poor coding to not check the status here! + if status~=0 + fprintf('%s\n','If you get a python error like: undefined symbol: PyFPE_jbuf, then see the following:') + fprintf('%s\n','https://stackoverflow.com/questions/36190757/numpy-undefined-symbol-pyfpe-jbuf/47703373') + error('createGroupIncidenceMatrix: call to inchi2gv.py failed') + end +end - %Match between the metabolites in the model and the metabolites in the training model - mappingScore = getMappingScores(model, trainingModel); +if isnumeric(trainingModel.cids_that_dont_decompose) + eval(['trainingModel.cids_that_dont_decompose = {' regexprep(sprintf('''C%05d''; ',trainingModel.cids_that_dont_decompose),'(;\s)$','') '};']); +end - % first just run the script to get the list of group names - fullpath = which('getGroupVectorFromInchi.m'); - fullpath = regexprep(fullpath,'getGroupVectorFromInchi.m',''); - - - [status,result] = system('python2 --version'); - if status~=0 - % https://github.com/bdu91/group-contribution/blob/master/compound_groups.py - % Bin Du et al. Temperature-Dependent Estimation of Gibbs Energies Using an Updated Group-Contribution Method - [status,groupsTemp] = system(['python ' fullpath 'compound_groups.py -l']); - if status~=0 - error('createGroupIncidenceMatrix: call to compound_groups.py failed') - end +groups = regexp(groupsTemp,'\n','split')'; +clear groupsTemp; +trainingModel.groups = groups(~cellfun(@isempty, groups)); +trainingModel.G = sparse(length(trainingModel.metKEGGID), length(trainingModel.groups)); +trainingModel.groupDecomposableBool = false(size(trainingModel.metKEGGID)); +trainingModel.testMetBool = false(size(trainingModel.metKEGGID)); +for i = 1:length(trainingModel.metKEGGID) + [score, modelRow] = max(full(mappingScore(:,i))); + if score == 0 + inchi = trainingModel.inchi.nonstandard{i}; + else + % if there is a match to the model, use the InChI from there to be consistent with later transforms + inchi = model.inchi.nonstandard{modelRow}; + trainingModel.testMetBool(i)=1; + end + + % There might be compounds in the model but not in the training data that also cannot be + % decomposed, we need to take care of them too (e.g. DMSO - C11143) + if isempty(inchi) || any(ismember(trainingModel.metKEGGID{i}, trainingModel.cids_that_dont_decompose)) + trainingModel.G(:, end+1) = 0; % add a unique 1 in a new column for this undecomposable compound + trainingModel.G(i, end) = 1; + trainingModel.groupDecomposableBool(i) = false; + else + group_def = getGroupVectorFromInchi(inchi); + if length(group_def) == length(trainingModel.groups) + trainingModel.G(i, 1:length(group_def)) = group_def; + trainingModel.groupDecomposableBool(i) = true; + elseif isempty(group_def) + warning(['createGroupIncidenceMatrix: undecomposable inchi: ' inchi]) + trainingModel.G(:, end+1) = 0; % add a unique 1 in a new column for this undecomposable compound + trainingModel.G(i, end) = 1; + trainingModel.groupDecomposableBool(i) = false; + trainingModel.cids_that_dont_decompose = [trainingModel.cids_that_dont_decompose; trainingModel.metKEGGID{i}]; else - if 1 - inchi2gv = 'inchi2gv'; - else - inchi2gv = 'compound_groups'; - end - [status,groupsTemp] = system(['python2 ' fullpath inchi2gv '.py -l']);%seems to only work with python 2, poor coding to not check the status here! - if status~=0 - fprintf('%s\n','If you get a python error like: undefined symbol: PyFPE_jbuf, then see the following:') - fprintf('%s\n','https://stackoverflow.com/questions/36190757/numpy-undefined-symbol-pyfpe-jbuf/47703373') - error('createGroupIncidenceMatrix: call to inchi2gv.py failed') - end + fprintf('InChI = %s\n', inchi); + fprintf('*************\n%s\n', getGroupVectorFromInchi(inchi, printLevel)); + error(sprintf('ERROR: while trying to decompose compound C%05d', trainingModel.metKEGGID{i})); end - - if isnumeric(trainingModel.cids_that_dont_decompose) - eval(['trainingModel.cids_that_dont_decompose = {' regexprep(sprintf('''C%05d''; ',trainingModel.cids_that_dont_decompose),'(;\s)$','') '};']); - end - - groups = regexp(groupsTemp,'\n','split')'; - clear groupsTemp; - trainingModel.groups = groups(~cellfun(@isempty, groups)); - trainingModel.G = sparse(length(trainingModel.metKEGGID), length(trainingModel.groups)); - trainingModel.groupDecomposableBool = false(size(trainingModel.metKEGGID)); - trainingModel.testMetBool = false(size(trainingModel.metKEGGID)); - trainingModel.trainingMetBool = false(size(trainingModel.metKEGGID)); - for i = 1:length(trainingModel.metKEGGID) - trainingModel.trainingMetBool(i)=1; - [score, modelRow] = max(full(mappingScore(:,i))); - if score == 0 - inchi = trainingModel.inchi.nonstandard{i}; - trainingModel.testMetBool(i)=0; - - else - % if there is a match to the model, use the InChI from there to be consistent with later transforms - inchi = model.inchi.nonstandard{modelRow}; - trainingModel.testMetBool(i)=1; - end - - % There might be compounds in the model but not in the training data that also cannot be - % decomposed, we need to take care of them too (e.g. DMSO - C11143) - if isempty(inchi) || any(ismember(trainingModel.metKEGGID{i}, trainingModel.cids_that_dont_decompose)) - trainingModel.G(:, end+1) = 0; % add a unique 1 in a new column for this undecomposable compound - trainingModel.G(i, end) = 1; - trainingModel.groupDecomposableBool(i) = false; - else - group_def = getGroupVectorFromInchi(inchi); - if length(group_def) == length(trainingModel.groups) - trainingModel.G(i, 1:length(group_def)) = group_def; - trainingModel.groupDecomposableBool(i) = true; - elseif isempty(group_def) - warning(['createGroupIncidenceMatrix: undecomposable inchi: ' inchi]) - trainingModel.G(:, end+1) = 0; % add a unique 1 in a new column for this undecomposable compound - trainingModel.G(i, end) = 1; - trainingModel.groupDecomposableBool(i) = false; - trainingModel.cids_that_dont_decompose = [trainingModel.cids_that_dont_decompose; trainingModel.metKEGGID{i}]; - else - fprintf('InChI = %s\n', inchi); - fprintf('*************\n%s\n', getGroupVectorFromInchi(inchi, param.printLevel)); - error(sprintf('ERROR: while trying to decompose compound C%05d', trainingModel.metKEGGID{i})); - end - end - end - trainingModel.G = sparse(trainingModel.G); - - - trainingModel.test2CombinedModelMap = zeros(size(model.mets)); - done = {}; - for n = 1:length(model.mets) - % first find all metabolites with the same name (can be in different compartments) - met = model.mets{n}(1:end-3); - if any(strcmp(met, done)) % this compound was already mapped - continue; - end - done = [done; {met}]; - - inchi = model.inchi.nonstandard{n}; - - [score, trainingRow] = max(full(mappingScore(n,:))); - if score == 0 % this compound is not in the training data - trainingRow = size(trainingModel.G, 1) + 1; - trainingModel.trainingMetBool(trainingRow)=1; - trainingModel.S(trainingRow, :) = 0; % Add an empty row to S - - % Add a row in G for this compound, either with its group vector, - % or with a unique 1 in a new column dedicate to this compound - trainingModel.G(trainingRow, :) = 0; - group_def = getGroupVectorFromInchi(inchi); - if length(group_def) == length(trainingModel.groups) - trainingModel.G(trainingRow, 1:length(group_def)) = group_def; - trainingModel.groupDecomposableBool(trainingRow) = true; - elseif isempty(group_def) - trainingModel.G(:, end+1) = 0; % add a unique 1 in a new column for this undecomposable compound - trainingModel.G(trainingRow, end) = 1; - trainingModel.groupDecomposableBool(trainingRow) = false; - else - error('The length of the group vector is different from the number of groups'); - end - end + end +end +trainingModel.G = sparse(trainingModel.G); + +trainingModel.Model2TrainingMap = zeros(size(model.mets)); +done = {}; + +for n = 1:length(model.mets) + % first find all metabolites with the same name (can be in different compartments) + met = model.mets{n}(1:end-3); + if any(strcmp(met, done)) % this compound was already mapped + continue; + end + done = [done; {met}]; + metIdx = strmatch([met '['], model.mets); + inchi = model.inchi.nonstandard{n}; + + [score, trainingRow] = max(full(mappingScore(n,:))); + if score == 0 % this compound is not in the training data + trainingRow = size(trainingModel.G, 1) + 1; + trainingModel.S(trainingRow, :) = 0; % Add an empty row to S + trainingModel.testMetBool(i)=1; + % Add a row in G for this compound, either with its group vector, + % or with a unique 1 in a new column dedicated to this compound + trainingModel.G(trainingRow, :) = 0; + group_def = getGroupVectorFromInchi(inchi); + if length(group_def) == length(trainingModel.groups) + trainingModel.G(trainingRow, 1:length(group_def)) = group_def; + trainingModel.groupDecomposableBool(trainingRow) = true; - metIdx = contains(model.mets,[met '[']); - trainingModel.test2CombinedModelMap(metIdx) = trainingRow; % map the model met to this NIST compound + elseif isempty(group_def) + trainingModel.G(:, end+1) = 0; % add a unique 1 in a new column for this undecomposable compound + trainingModel.G(trainingRow, end) = 1; + trainingModel.groupDecomposableBool(trainingRow) = false; + else + error('The length of the group vector is different than the number of groups'); end - %new variable name for combined model - combinedModel = trainingModel; - otherwise - error(['Unrecognised param.fragmentationMethod =' param.fragmentationMethod]) + end + trainingModel.Model2TrainingMap(metIdx) = trainingRow; % map the model met to this NIST compound end -boolGroup=~any(combinedModel.G,1); -if any(boolGroup) - error([int2str(nnz(boolGroup)) ' groups without any corresponding metabolite']) -end -boolMet=~any(combinedModel.G,2); -if any(boolMet) - error([int2str(nnz(boolMet)) ' metabolites without any corresponding group']) -end - - +[m,g]=size(trainingModel.G); +trainingModel.trainingMetBool=false(m,1); +trainingModel.trainingMetBool(1:length(trainingModel.metKEGGID),1)=1; \ No newline at end of file diff --git a/src/analysis/thermo/vonBertalanffy/addInchiToModel.m b/src/analysis/thermo/inchi/addInchiToModel.m similarity index 100% rename from src/analysis/thermo/vonBertalanffy/addInchiToModel.m rename to src/analysis/thermo/inchi/addInchiToModel.m diff --git a/src/dataIntegration/chemoInformatics/molecularWeight/getMolecularWeight.m b/src/analysis/thermo/inchi/getMolecularWeight.m similarity index 100% rename from src/dataIntegration/chemoInformatics/molecularWeight/getMolecularWeight.m rename to src/analysis/thermo/inchi/getMolecularWeight.m diff --git a/src/analysis/thermo/protons/old/inchi2mol.m b/src/analysis/thermo/inchi/inchi2mol.m similarity index 100% rename from src/analysis/thermo/protons/old/inchi2mol.m rename to src/analysis/thermo/inchi/inchi2mol.m diff --git a/src/analysis/thermo/inchi/createInChIStruct.m b/src/analysis/thermo/inchi/new/createInChIStruct.m similarity index 100% rename from src/analysis/thermo/inchi/createInChIStruct.m rename to src/analysis/thermo/inchi/new/createInChIStruct.m diff --git a/src/analysis/thermo/inchi/getFormulaAndChargeFromInChI.m b/src/analysis/thermo/inchi/new/getFormulaAndChargeFromInChI.m similarity index 100% rename from src/analysis/thermo/inchi/getFormulaAndChargeFromInChI.m rename to src/analysis/thermo/inchi/new/getFormulaAndChargeFromInChI.m diff --git a/src/analysis/thermo/inchi/old/createInChIStruct.m b/src/analysis/thermo/inchi/old/createInChIStruct.m new file mode 100644 index 0000000000..5b6d2e1375 --- /dev/null +++ b/src/analysis/thermo/inchi/old/createInChIStruct.m @@ -0,0 +1,66 @@ +function inchiStruct = createInChIStruct(mets,sdfFileName) +% Converts metabolite structures in SDF to InChI strings with OpenBabel, +% and maps InChIs to mets. +% +% inchi = createInChIStruct(mets,sdfFileName) +% +% INPUTS +% mets m x 1 cell array of metabolite identifiers (e.g., BiGG +% abbreviations). +% sdfFileName SDF with structures of metabolites in mets. Metabolite +% identifiers in the SDF are assumed to be the same as in +% mets. +% +% OUTPUTS +% inchi Structure with following fields: +% .standard Standard InChIs with no isotope, stereo +% or charge layers. +% .standardWithStereo Standard InChIs with stereo layers. +% .standardWithStereoAndCharge Standard InChIs with stereo and charge +% layers +% .nonstandard Nonstandard InChI with all layers. +% +% Hulda SH, Nov. 2012 + +% Convert SDF to InChIs +[standard,metList1] = sdf2inchi(sdfFileName,'-xtT/noiso/nochg/nostereo'); + +[standardWithStereo,metList2] = sdf2inchi(sdfFileName,'-xtT/noiso/nochg'); +if ~all(strcmp(metList1,metList2)) + error('Error creating InChI structure.'); +end + +[standardWithStereoAndCharge,metList3] = sdf2inchi(sdfFileName,'-xtT/noiso'); +if ~all(strcmp(metList1,metList3)) + error('Error creating InChI structure.'); +end + +[nonstandard,metList4] = sdf2inchi(sdfFileName,'-xtFT/noiso'); +if ~all(strcmp(metList1,metList4)) + error('Error creating InChI structure.'); +end + +% Map InChIs to mets +inchiStruct.standard = cell(size(mets)); +inchiStruct.standardWithStereo = cell(size(mets)); +inchiStruct.standardWithStereoAndCharge = cell(size(mets)); +inchiStruct.nonstandard = cell(size(mets)); + +mets = reshape(mets,length(mets),1); +if ischar(mets) + mets = strtrim(cellstr(mets)); +end +if iscell(mets) + mets = regexprep(mets,'(\[\w\])$',''); % Remove compartment assignment +end +if isnumeric(mets) + mets = strtrim(cellstr(num2str(mets))); +end + +for i = 1:length(metList1) + inchiStruct.standard(ismember(mets,metList1{i})) = standard(i); + inchiStruct.standardWithStereo(ismember(mets,metList1{i})) = standardWithStereo(i); + inchiStruct.standardWithStereoAndCharge(ismember(mets,metList1{i})) = standardWithStereoAndCharge(i); + inchiStruct.nonstandard(ismember(mets,metList1{i})) = nonstandard(i); +end + diff --git a/src/analysis/thermo/inchi/old/getFormulaAndChargeFromInChI.m b/src/analysis/thermo/inchi/old/getFormulaAndChargeFromInChI.m new file mode 100644 index 0000000000..030065bdaf --- /dev/null +++ b/src/analysis/thermo/inchi/old/getFormulaAndChargeFromInChI.m @@ -0,0 +1,57 @@ +function [formula, nH, charge] = getFormulaAndChargeFromInChI(inchi) + +% [formula,charge] = getFormulaAndChargeFromInChI(inchi) +% +% INPUT +% inchi.......Nonstandard IUPAC InChI for a particular pseudoisomer of a +% metabolite +% +% OUTPUTS +% formula....The chemical formula for the input pseudoisomer +% charge.....The charge on the input pseudoisomer + + +layers = regexp(inchi,'/','split'); +f1 = layers{2}; % Fully protonated formula + +p = {}; +if ~isempty(strmatch('p',layers)) + p = layers{strmatch('p',layers)}; % nH to add or subtract +end + +q = {}; +if ~isempty(strmatch('q',layers)) + q = layers{strmatch('q',layers)}; % charge +end + +if ~isempty(q) + charge = str2double(q(2:end)); +else + charge = 0; +end + +f1_nH = numAtomsOfElementInFormula(f1,'H'); % nH in fully protonated formula +if ~isempty(p) + nH = f1_nH + str2double(p(2:end)); % nH in pseudoisomer formula +else + nH = f1_nH; +end + +formula = regexprep(f1,'H[a-z]*[0-9]*',''); % Remove all H from fully protonated formula +if nH == 1 + formula = [formula 'H']; +elseif nH > 1 + formula = [formula 'H' num2str(nH)]; % Add appropriate nH back in to create pseudoisomer formula +elseif nH < 0 + error('Negative number of H in formula.') % Should never get here +end + +% In case there is Hg in formula +f1_nHg = numAtomsOfElementInFormula(f1,'Hg'); +if f1_nHg == 1 + formula = [formula 'Hg']; +elseif f1_nHg > 1 + formula = [formula 'Hg' num2str(f1_nHg)]; +end + +end diff --git a/src/analysis/thermo/inchi/sdf2inchi.m b/src/analysis/thermo/inchi/sdf2inchi.m deleted file mode 100644 index c263bb9217..0000000000 --- a/src/analysis/thermo/inchi/sdf2inchi.m +++ /dev/null @@ -1,62 +0,0 @@ -function [inchi, metList] = sdf2inchi(sdfFileName, options) -% Converts metabolite structures in an SDF to a cell array of InChI -% strings with OpenBabel. -% -% USAGE: -% -% [inchi, metList] = sdf2inchi(sdfFileName, options) -% -% INPUT: -% sdfFileName: Path to SDF file. -% -% OPTIONAL INPUTS: -% options: Write options for InChI strings. See InChI documentation -% for details. If no options are specified the function will -% output standard InChI. -% -% OUTPUTS: -% inchi: Cell array of InChI strings for metabolites in the SDF file. -% metList: Cell array of metabolite identifiers (first line of each -% molfile in SDF). Will be empty unless write option `t` is -% used (i.e., options >= '-xt'). -% -% .. Author: - Hulda SH, Nov. 2012 - -if ~strcmp(sdfFileName(end-3:end),'.sdf') % Check inputs - sdfFileName = [sdfFileName '.sdf']; -end - -if ~exist('options','var') - options = []; -end -if ~isempty(options) - options = [' ' strtrim(options)]; -end - -%babelcmd='obabel'; -babelcmd='babel'; - -% Convert to InChI with OpenBabel -[success,result] = system([babelcmd ' ' sdfFileName ' -oinchi' options]); - -% Parse output from OpenBabel -if success == 0 - result = regexp(result,'InChI=[^\n]*\n','match'); - result = result'; - result = strtrim(result); - - [inchi,metList] = strtok(result); - inchi = strtrim(inchi); - metList = strtrim(metList); -else - [success,result] = system([babelcmd ' ' sdfFileName ' -oinchi' options]) - fprintf('%s\n','If you get a ''not found'' message from the call to Babel, make sure that Matlab''s LD_LIBRARY_PATH is edited to include correct system libraries. See initVonBertylanffy') - error('Conversion to InChI not successful. Make sure OpenBabel is installed correctly.\n') -end - -if size(inchi,2) > size(inchi,1) - inchi = inchi'; -end -if size(metList,2) > size(metList,1) - metList = metList'; -end diff --git a/src/analysis/thermo/vonBertalanffy/addPseudoisomersToModel.m b/src/analysis/thermo/protons/addPseudoisomersToModel.m similarity index 100% rename from src/analysis/thermo/vonBertalanffy/addPseudoisomersToModel.m rename to src/analysis/thermo/protons/addPseudoisomersToModel.m diff --git a/src/analysis/thermo/protons/new/inchi2mol.m b/src/analysis/thermo/protons/new/inchi2mol.m deleted file mode 100644 index ec50745cef..0000000000 --- a/src/analysis/thermo/protons/new/inchi2mol.m +++ /dev/null @@ -1,85 +0,0 @@ -function successbool = inchi2mol(inchis, filenames, outputdir, overwrite) -% Convert InChI strings to mol files using OpenBabel. Compatible with -% Windows and Unix. -% -% USAGE: -% -% successbool = inchi2mol(inchis, filenames, outputdir, overwrite) -% -% INPUTS: -% inchis: `n x 1` Cell array of InChI strings -% filenames: `n x 1` Cell array of mol file names without file extension. Default is {'1' ;'2' ;...} -% outputdir: Directory for mol files. Default is 'CurrentDirectory\molfiles'. -% overwrite: 0, [1]. Specify whether to overwrite existing mol files in outputdir. -% -% OUTPUTS: -% successbool: `n x 1` logical vector with 1 at indices corresponding to -% inchis that were successfully converted to mol files and 0 elsewhere. -% -% .. Author: - Hulda SH Feb. 2011 - -if ~iscell(inchis) && size(inchis,1) == 1 % Format input and set defaults, If a single InChI is input as string - inchis = {inchis}; -end - -if ~exist('filenames','var') || isempty(filenames) % Create cell array of default filenames - filenames = 1:length(inchis); - filenames = filenames'; - filenames = num2str(filenames); - filenames = mat2cell(filenames,ones(1,size(filenames,1)),size(filenames,2)); - filenames = deblank(filenames); -end - -if ~iscell(filenames) && size(filenames,1) == 1 % If a single file name is input as string - filenames = {filenames}; -end - -if size(inchis) ~= size(filenames) - error('inchis and filenames should be the same size.'); -end - -% Set other defaults -if ~exist('outputdir','var') || isempty(outputdir) - if ~exist([pwd '\molfiles'],'dir') - mkdir(pwd,'molfiles') - end - outputdir = [pwd '\molfiles']; -end - -if ~exist('overwrite','var') || isempty(overwrite) - overwrite = 0; -end - -% Begin conversion -if ~exist('outputdir','dir') % Create output directory - mkdir(outputdir) -end - -system(['cd ' outputdir]); % Make OS cd to outputdir -cd(outputdir); % Make Matlab cd to outputdir - -% List existing mol files in outputdir -d = dir(outputdir); -existingMolFiles = cat(2,d.name); -existingMolFiles = ['.mol' existingMolFiles]; - -nInchis = length(inchis); -successbool = false(nInchis,1); - -for n = 1:nInchis - if ~isempty(inchis{n}) - if isempty(strfind(existingMolFiles,['.mol' filenames{n} '.mol'])) || overwrite == 1 % If mol file of same name does not already exist or if it should be overwritten - system(['echo ' inchis{n} ' > inchi.inchi']); % Temporary place holder for inchi. Overwritten in each loop. - system(['babel inchi.inchi ' filenames{n} '.mol']); % Convert InChI to mol file - - f = dir([outputdir '\' filenames{n} '.mol']); - if f.bytes > eps % If mol file is empty the conversion failed - successbool(n) = true; - else - delete([outputdir '\' filenames{n} '.mol']) % Delete empty mol files - end - end - end -end - -delete([outputdir '\inchi.inchi']) diff --git a/src/analysis/thermo/vonBertalanffy/assignQuantDir.m b/src/analysis/thermo/thermoDirectionality/assignQuantDir.m similarity index 100% rename from src/analysis/thermo/vonBertalanffy/assignQuantDir.m rename to src/analysis/thermo/thermoDirectionality/assignQuantDir.m diff --git a/src/analysis/thermo/vonBertalanffy/thermoConstrainFluxBounds.m b/src/analysis/thermo/thermoDirectionality/thermoConstrainFluxBounds.m similarity index 100% rename from src/analysis/thermo/vonBertalanffy/thermoConstrainFluxBounds.m rename to src/analysis/thermo/thermoDirectionality/thermoConstrainFluxBounds.m diff --git a/src/dataIntegration/chemoInformatics/addMetInfoInCBmodel.m b/src/dataIntegration/chemoInformatics/addMetInfoInCBmodel.m new file mode 100644 index 0000000000..eee5fc11fb --- /dev/null +++ b/src/dataIntegration/chemoInformatics/addMetInfoInCBmodel.m @@ -0,0 +1,116 @@ +function [newModel, hasEffect] = addMetInfoInCBmodel(model, inputData, replace) +% Integrates metabolite data from an external file and incorporates it into +% the COBRA model. +% +% USAGE: +% +% model = addMetInfoInCBmodel(model, inputData) +% +% INPUTS: +% model: COBRA model with following fields: +% +% * .S - The m x n stoichiometric matrix for the +% metabolic network. +% * .mets - An m x 1 array of metabolite identifiers. +% inputData: COBRA model with following fields: +% +% OPTIONAL INPUTS: +% replace: If the new ID should replace an existing ID, this +% logical value indicates so (default: false). +% +% OUTPUTS: COBRA model with updated identifiersCOBRA model with +% the identifiers updated. + +if nargin < 3 || isempty(replace) + replace = false; +end + +hasEffect = false; + +if isfile(inputData) + dbData = readtable(inputData); +elseif isstruct(inputData) + dbData = readtable(inputData); +end + +% Get data from model and external source +sources = {'KEGG', 'HMDB', 'ChEBI', 'PubChem', 'SMILES', 'InChI'}; +fields = fieldnames(model); +newDataVariableNames = dbData.Properties.VariableNames; +metsInModel = regexprep(model.mets, '(\[\w\])', ''); + +for i = 1:size(sources, 2) + + % Get the corresponding field for a database in the model + fieldInModelBool = ~cellfun(@isempty, regexpi(fields, sources{i})); + if sum(fieldInModelBool) > 1 + metDataInModelBool = ~cellfun(@isempty, regexpi(fields, 'met')); + fieldInModelBool = fieldInModelBool & metDataInModelBool; + end + % Create the field if it does not exist + if sum(fieldInModelBool) == 0 + model.(['met' sources{i}]) = cell(size(metsInModel, 1), 1); + fields = fieldnames(model); + fieldInModelBool = ~cellfun(@isempty, regexpi(fields, sources{i})); + end + + % Get the corresponding field for a database in the data + fieldInDataBool = ~cellfun(@isempty, regexpi(newDataVariableNames, sources{i})); + if sum(fieldInDataBool) > 1 + metDataInDataBool = ~cellfun(@isempty, regexpi(newDataVariableNames, 'met')); + inchiKeyInDataBool = cellfun(@isempty, regexpi(newDataVariableNames, 'inchikey')); + fieldInDataBool = fieldInDataBool & (metDataInDataBool | inchiKeyInDataBool); + end + + if sum(fieldInModelBool) == 1 && sum(fieldInDataBool) == 1 + + % Identify the correct idx per metabolite + for j = 1:size(dbData.(newDataVariableNames{1}), 1) + + % Fix for numeric values omitting 0 and NaN + if isnumeric(dbData.(newDataVariableNames{fieldInDataBool})(j)) + if ~isnan(dbData.(newDataVariableNames{fieldInDataBool})(j)) && dbData.(newDataVariableNames{fieldInDataBool})(j) ~= 0 + data2add = num2str(dbData.(newDataVariableNames{fieldInDataBool})(j)); + else + data2add = []; + end + else + data2add = dbData.(newDataVariableNames{fieldInDataBool}){j}; + end + idx = strmatch(dbData.(newDataVariableNames{1}){j}, metsInModel, 'exact'); + + % Fill the data + if ~isempty(idx) && replace && ~isempty(data2add) + + % Add the ID + for k = 1:size(idx, 1) + model.(fields{fieldInModelBool})(idx(k)) = data2add; + end + if ~hasEffect + hasEffect = true; + end + + elseif ~isempty(idx) && ~replace && ~isempty(data2add) + + % Only add data on empty cells + idxBool = cellfun(@isempty, model.(fields{fieldInModelBool})(idx)); + if any(idxBool) + idx = idx(idxBool); + % Add the ID + for k = 1:size(idx, 1) + model.(fields{fieldInModelBool}){idx(k)} = data2add; + end + if ~hasEffect + hasEffect = true; + end + end + + end + + end + elseif sum(fieldInModelBool) > 1 || sum(fieldInDataBool) > 1 + error(['Rename the data, nameTag "' sources{i} '" was found more than 1 time']) + end +end + +newModel = model; \ No newline at end of file diff --git a/src/dataIntegration/chemoInformatics/checkMetIdsInModel.m b/src/dataIntegration/chemoInformatics/checkMetIdsInModel.m new file mode 100644 index 0000000000..58d82e3ca7 --- /dev/null +++ b/src/dataIntegration/chemoInformatics/checkMetIdsInModel.m @@ -0,0 +1,75 @@ +function metsIdsStatistics = checkMetIdsInModel(model) +% Prints the statistics of the metabolites id in the a model. +% +% USAGE: +% +% metsIdsStatistics = checkMetIdsInModel(model) +% +% INPUTS: +% model: COBRA model with following fields: +% +% * .mets - An m x 1 array of metabolite identifiers. +% Should match metabolite identifiers in +% RXN. +% OUTPUTS: +% metsIdsStatistics: Information such as the number of metabolites per +% database, coverage, or the number of times another +% ID is shared for the same metabolite. + +fields = fieldnames(model); +sources = {'kegg', 'hmdb', 'chebi', 'pubchem', 'smiles', 'inchi'}; +mets = regexprep(model.mets, '(\[\w\])', ''); +umets = unique(mets); + +for i = 1:length(sources) + + fieldInModelBool = ~cellfun(@isempty, regexpi(fields, sources{i})); + if any(fieldInModelBool) + + if sum(fieldInModelBool) == 1 + sourceIdsPerMet = model.(fields{fieldInModelBool}); + else + dbFields = fields(fieldInModelBool); + sourceIdsPerMet = model.(dbFields{~cellfun(@isempty, regexpi(dbFields, 'met'))}); + end + + + sourceData.metWithIdBool = ~cellfun(@isempty, sourceIdsPerMet); + sourceData.metWithIdTotal = sum(sourceData.metWithIdBool); + sourceData.DBcoverage = (sourceData.metWithIdTotal * 100) / size(mets, 1); + + % unique metabolites + dbMets = sourceIdsPerMet(sourceData.metWithIdBool); + uDbMets = unique(dbMets); + uniqueBool = false(size(mets, 1), 1); + for j = 1:size(uDbMets, 1) + idx = find(ismember(sourceIdsPerMet, uDbMets{j})); + uniqueBool(idx(1)) = true; + end + + sourceData.uniqueMetWithIdBool = uniqueBool; + sourceData.uniqueMetWithIdTotal = sum(sourceData.uniqueMetWithIdBool); + sourceData.uniqueIdDBcoverage = (sourceData.uniqueMetWithIdTotal * 100) / size(umets, 1); + + switch sources{i} + + case 'kegg' + metsIdsStatistics.KEEG = sourceData; + case 'hmdb' + metsIdsStatistics.HMDB = sourceData; + case 'chebi' + metsIdsStatistics.ChEBi = sourceData; + case 'pubchem' + metsIdsStatistics.PubChem = sourceData; + case 'smiles' + metsIdsStatistics.SMILES = sourceData; + case 'inchi' + metsIdsStatistics.InChI = sourceData; + end + + clear DBdata dbIdsPerMet + end +end + +end + diff --git a/src/dataIntegration/chemoInformatics/editChemicalFormula.m b/src/dataIntegration/chemoInformatics/editChemicalFormula.m new file mode 100644 index 0000000000..4700258498 --- /dev/null +++ b/src/dataIntegration/chemoInformatics/editChemicalFormula.m @@ -0,0 +1,76 @@ +function newFormula = editChemicalFormula(metFormula, addOrRemove) +% For each instance, removes non-chemical characters from the chemical +% formula and replaces them with a R group. Removes atoms from the formula +% as well. Produces a chemical formula from a string of atoms. +% +% USAGE: +% +% [metFormula] = cobraFormulaToChemFormula(metReconFormula) +% +% INPUTS: +% metReconFormula: An n x 1 array of metabolite Recon formulas +% OPTIONAL INPUTS: +% addOrRemove: A struct array containing: +% *.elements - element to edit +% *.times - vector indicated the times the +% element will be deleted (negative) or +% added (positive) +% +% OUTPUTS: +% chemicalFormula: A chemical formula for a metabolite + +if nargin < 2 || isempty(addOrRemove) + addOrRemove = []; +end + +% Update R groups +metFormula = regexprep(char(metFormula), 'X|Y|*|FULLR', 'R'); + +% Rearrange the formulas (e.g., from RCRCOC to C3OR2) + +% Count the atoms +[elemetList, ~ , elemetEnd] = regexp(metFormula, ['[', 'A':'Z', '][', 'a':'z', ']?'], 'match'); +[num, numStart] = regexp(metFormula, '\d+', 'match'); +numList = ones(size(elemetList)); +idx = ismember(elemetEnd + 1, numStart); +numList(idx) = cellfun(@str2num, num); + +% Combine atoms if neccesary +uniqueElemetList = unique(elemetList); +for j = 1:size(uniqueElemetList, 2) + atomBool = ismember(elemetList, uniqueElemetList{j}); + if sum(atomBool) > 1 + dataToKeep = find(atomBool); + numList(dataToKeep(1)) = sum(numList(atomBool)); + numList(dataToKeep(2:end)) = []; + elemetList(dataToKeep(2:end)) = []; + end +end + +% Delete/add atoms if neccesary +if ~isempty(addOrRemove) + for i = 1:length(addOrRemove.elements) + elemetBool = ismember(elemetList, addOrRemove.elements(i)); + if any(elemetBool) + numList(elemetBool) = numList(elemetBool) + addOrRemove.times(i); + else + elemetList{size(elemetList, 2) + 1} = addOrRemove.elements{i}; + numList(size(elemetList, 2)) = addOrRemove.times(i); + end + end +end + +% Sort atoms +[elemetList, ia] = sort(elemetList); + +% Make formula +newFormula = []; +for j = 1:size(elemetList, 2) + if numList(ia(j)) == 1 + newFormula = [newFormula elemetList{j}]; + else + newFormula = [newFormula elemetList{j} num2str(numList(ia(j)))]; + end +end + +end diff --git a/src/dataIntegration/chemoInformatics/generateChemicalDatabase.m b/src/dataIntegration/chemoInformatics/generateChemicalDatabase.m new file mode 100644 index 0000000000..68d739e25e --- /dev/null +++ b/src/dataIntegration/chemoInformatics/generateChemicalDatabase.m @@ -0,0 +1,1095 @@ +function [info, newModel] = generateChemicalDatabase(model, options) +% This function uses the metabolite identifiers in the model to compare +% them and save the identifiers with the best score in MDL MOL format +% and/or inchi and simles and jpeg if it's installed cxcalc and openBabel. +% The obtained MDL MOL files will serve as the basis for creating the MDL +% RXN files that represent a metabolic reaction and can only be written if +% there is a MDL MOL file for each metabolite in a metabolic reaction. +% If JAVA is installed, it also atom maps the metabolic reactions +% with an MDL RXN file. +% +% USAGE: +% +% [info, newModel] = generateChemicalDatabase(model, options) +% +% INPUTS: +% +% model: COBRA model with following fields: +% +% * .S - The m x n stoichiometric matrix for the +% metabolic network. +% * .rxns - An n x 1 array of reaction identifiers. +% * .mets - An m x 1 array of metabolite identifiers. +% * .metFormulas - An m x 1 array of metabolite chemical formulas. +% * .metinchi - An m x 1 array of metabolite identifiers. +% * .metsmiles - An m x 1 array of metabolite identifiers. +% * .metKEGG - An m x 1 array of metabolite identifiers. +% * .metHMDB - An m x 1 array of metabolite identifiers. +% * .metPubChem - An m x 1 array of metabolite identifiers. +% * .metCHEBI - An m x 1 array of metabolite identifiers. +% +% options: A structure containing all the arguments for the function: +% +% * .outputDir: The path to the directory containing the RXN +% files with atom mappings (default: current directory) +% * .printlevel: Verbose level +% * .standardisationApproach: String containing the type of +% standardisation for the molecules (default: 'explicitH' +% if openBabel is installed, otherwise default: 'basic'): +% - explicitH: Normal chemical graphs; +% - implicitH: Hydrogen suppressed chemical graph; +% - neutral: Chemical graphs with protonated molecules; +% - basic: Update the header. +% * .keepMolComparison: Logic value for comparing MDL MOL +% files from various sources (default: FALSE) +% * .onlyUnmapped: Logic value to select create only unmapped +% MDL RXN files (default: FALSE). +% * .adjustToModelpH: Logic value used to determine whether a +% molecule's pH must be adjusted in accordance with the +% COBRA model. (default: TRUE, requires MarvinSuite). +% * .addDirsToCompare: Cell(s) with the path to directory to +% an existing database (default: empty). +% * .dirNames: Cell(s) with the name of the directory(ies) +% (default: empty). +% * .debug: Logical value used to determine whether or not +% the results of different points in the function will be +% saved for debugging (default: empty). +% +% OUTPUTS: +% +% newModel: A new model with the comparison and if onlyUnmapped = +% false, the informaton about the bonds broken and formed +% as well as the bond enthalpies for each metabolic +% reaction. +% info: A diary of the database generation process + +%% 1. Initialise data and set default variables + +if ~isfield(options, 'outputDir') + outputDir = [pwd filesep]; +else + % Make sure input path ends with directory separator + outputDir = [regexprep(options.outputDir,'(/|\\)$',''), filesep]; +end +metDir = [outputDir 'mets']; +rxnDir = [outputDir 'rxns']; + +modelFields = fieldnames(model); + +if ~isfield(options, 'debug') + options.debug = false; +end +if ~isfield(options, 'printlevel') + options.printlevel = 1; +end +if ~isfield(options, 'standardisationApproach') + options.standardisationApproach = 'explicitH'; +end +if ~isfield(options, 'keepMolComparison') + options.keepMolComparison = false; +end +if ~isfield(options, 'onlyUnmapped') + options.onlyUnmapped = false; +end +if ~isfield(options, 'adjustToModelpH') + options.adjustToModelpH = true; +end +if isfield(options, 'dirsToCompare') + dirsToCompare = true; + for i = 1:length(options.dirsToCompare) + options.dirsToCompare{i} = [regexprep(options.dirsToCompare{i},'(/|\\)$',''), filesep]; + if ~isfolder(options.dirsToCompare) + display([options.dirsToCompare{i} ' is not a directory']) + options.dirsToCompare{i} = []; + end + end +else + options.dirsToCompare = []; + dirsToCompare = false; +end +if ~isfield(options, 'dirNames') + for i = 1:length(options.dirsToCompare) + options.dirNames{i, 1} = ['localDir' num2str(i)]; + end +else + if isrow(options.dirNames) + options.dirNames = options.dirNames'; + end +end + +% Check if ChemAxon and openBabel are installed +[cxcalcInstalled, ~] = system('cxcalc'); +cxcalcInstalled = ~cxcalcInstalled; +if cxcalcInstalled == 0 + cxcalcInstalled = false; + display('cxcalc is not installed, two features cannot be used: ') + display('1 - jpeg files for molecular structures (obabel required)') + display('2 - pH adjustment according to model.met Formulas') +end +[oBabelInstalled, ~] = system('obabel'); +if oBabelInstalled ~= 1 + oBabelInstalled = false; + options.standardisationApproach = 'basic'; + display('obabel is not installed, two features cannot be used: ') + display('1 - Generation of SMILES, InChI and InChIkey') + display('2 - MOL file standardisation') +end +[javaInstalled, ~] = system('java'); +if javaInstalled ~= 1 && ~options.onlyUnmapped + display('java is not installed, atom mappings cannot be computed') + options.onlyUnmapped = true; +end + +% Start diary +if ~isfolder(outputDir) + mkdir(outputDir); +end +diaryFilename = [outputDir datestr(now,30) '_DatabaseDiary.txt']; +diary(diaryFilename) + +if options.printlevel > 0 + disp('--------------------------------------------------------------') + disp('CHEMICAL DATABASE') + disp('--------------------------------------------------------------') + disp(' ') + fprintf('%s\n', 'Generating a chemical database with the following options:') + disp(' ') + disp(options) + disp('--------------------------------------------------------------') +end + +directories = {'inchi'; 'smiles'; 'KEGG'; 'HMDB'; 'PubChem'; 'CHEBI'}; +if dirsToCompare + directories = [directories; options.dirNames]; +end + +mets = regexprep(model.mets, '(\[\w\])', ''); +umets = unique(mets); + +%% 2. Obtain metabolite structures from different sources + +% SOURCES +% 1.- InChI (requires openBabel to obtain MOL file) +% 2.- Smiles (requires openBabel to obtain MOL file) +% 3.- KEGG (https://www.genome.jp/) +% 4.- HMDB (https://hmdb.ca/) +% 5.- PubChem (https://pubchem.ncbi.nlm.nih.gov/) +% 6.- CHEBI (https://www.ebi.ac.uk/) + +if options.printlevel > 0 + fprintf('%s\n\n', 'Obtaining MOL files from chemical databases ...') +end + +comparisonDir = [metDir filesep 'molComparison' filesep]; +source = [0 0 0 0 0 0 0]; +for i = 1:6 + dirBool(i) = false; + if any(~cellfun(@isempty, regexpi(modelFields, directories{i}))) + dirBool(i) = true; + sourceData = source; + sourceData(i + 1) = source(i + 1) + i + 1; + molCollectionReport = obtainMetStructures(model, comparisonDir, false, [], sourceData); + movefile([comparisonDir filesep 'newMol'], ... + [comparisonDir filesep directories{i}]) + info.sourcesCoverage.(directories{i}) = molCollectionReport; + info.sourcesCoverage.totalCoverage(i) = molCollectionReport.noOfMets; + info.sourcesCoverage.source{i} = directories{i}; + if options.printlevel > 0 + disp([directories{i} ':']) + display(molCollectionReport) + end + end +end +if ~isempty(dirsToCompare) + for i = 1:length(options.dirsToCompare) + % Get list of MOL files + d = dir(options.dirsToCompare{i}); + d = d(~[d.isdir]); + metList = {d.name}'; + metList = metList(~cellfun('isempty', regexp(metList,'(\.mol)$'))); + metList = regexprep(metList, '.mol', ''); + metList(~ismember(metList, umets)) = []; + info.sourcesCoverage.totalCoverage(i + 6) = length(metList); + info.sourcesCoverage.source{i + 6} = options.dirNames{i}; + end +end +% Remove sources without a single metabolite present the model +if dirsToCompare + emptySourceBool = info.sourcesCoverage.totalCoverage == 0; + info.sourcesCoverage.totalCoverage(emptySourceBool) = []; + directories(emptySourceBool) = []; + dirsToDeleteBool = ismember(options.dirNames, info.sourcesCoverage.source(emptySourceBool)); + options.dirsToCompare(dirsToDeleteBool) = []; + options.dirNames(dirsToDeleteBool) = []; + info.sourcesCoverage.source(emptySourceBool) = []; +else + directories(~dirBool) = []; +end + +if options.debug + save([outputDir '2.debug_afterDownloadMetabolicStructures.mat']) +end + +%% 3. Compare MOL files downloaded and save the best match + +if options.printlevel > 0 + fprintf('%s\n', 'Comparing information from sources ...') +end + +for i = 1:size(directories, 1) + + % Set dir + if i > 6 && dirsToCompare + sourceDir = options.dirsToCompare{i - 6}; + else + sourceDir = [comparisonDir directories{i} filesep]; + end + + + % Get list of MOL files + d = dir(sourceDir); + d = d(~[d.isdir]); + metList = {d.name}'; + metList = metList(~cellfun('isempty', regexp(metList,'(\.mol)$'))); + metList = regexprep(metList, '.mol', ''); + metList(~ismember(metList, mets)) = []; + + warning('off') + for j = 1:size(metList, 1) + name = [metList{j} '.mol']; + + if oBabelInstalled + + % Get inchis of the original metabolites + command = ['obabel -imol ' sourceDir name ' -oinchi ']; + [~, result] = system(command); + result = split(result); + + % Group inchis in the correct group + if any(~cellfun(@isempty, regexp(result, 'InChI=1S'))) + + % Create InChI table + if ~exist('groupedInChIs','var') + groupedInChIs = table(); + end + % Identify the correct index + if ~ismember('metNames', groupedInChIs.Properties.VariableNames) + groupedInChIs.metNames{j} = regexprep(name, '.mol', ''); + idx = 1; + elseif ~ismember(regexprep(name, '.mol', ''), groupedInChIs.metNames) + idx = size(groupedInChIs.metNames, 1) + 1; + groupedInChIs.metNames{idx} = regexprep(name, '.mol', ''); + else + idx = find(ismember(groupedInChIs.metNames, regexprep(name, '.mol', ''))); + end + % Save inchi in the table + groupedInChIs.(directories{i}){idx} = result{~cellfun(@isempty, regexp(result, 'InChI=1S'))}; + + else + + % Create SMILES table for molecules with R groups + if ~exist('groupedSMILES','var') + groupedSMILES = table(); + end + % Identify the correct index + if ~ismember('metNames', groupedSMILES.Properties.VariableNames) + groupedSMILES.metNames{1} = regexprep(name, '.mol', ''); + idx = 1; + elseif ~ismember(regexprep(name, '.mol', ''), groupedSMILES.metNames) + idx = size(groupedSMILES.metNames, 1) + 1; + groupedSMILES.metNames{idx} = regexprep(name, '.mol', ''); + else + idx = find(ismember(groupedSMILES.metNames, regexprep(name, '.mol', ''))); + end + % Get SMILES + command = ['obabel -imol ' sourceDir name ' -osmiles ']; + [~, result] = system(command); + if contains(result, '0 molecules converted') + continue + end + result = splitlines(result); + result = split(result{end - 2}); + % Save SMILES in the table + groupedSMILES.(directories{i}){idx} = result{1}; + + end + + else + + % Create SMILES table for molecules with R groups + if ~exist('groupedFormula','var') + groupedFormula = table(); + end + % Identify the correct index + if ~ismember('metNames', groupedFormula.Properties.VariableNames) + groupedFormula.metNames{1} = regexprep(name, '.mol', ''); + idx = 1; + elseif ~ismember(regexprep(name, '.mol', ''), groupedFormula.metNames) + idx = size(groupedFormula.metNames, 1) + 1; + groupedFormula.metNames{idx} = regexprep(name, '.mol', ''); + else + idx = find(ismember(groupedFormula.metNames, regexprep(name, '.mol', ''))); + end + % Get formula from MOL + molFile = regexp(fileread([sourceDir name]), '\n', 'split')'; + atomsString = []; + for k = 1:str2num(molFile{4}(1:3)) + atomsString = [atomsString strtrim(molFile{4 + k}(32:33))]; + end + groupedFormula.(directories{i}){idx} = editChemicalFormula(atomsString); + + end + end + warning('on') +end + +if options.debug + save([outputDir '3a.debug_beforeComparison.mat']) +end + +if oBabelInstalled + + % Obtain the most consistent molecules + reportComparisonInChIs = consistentData(model, groupedInChIs, 'InChI'); + fields = fieldnames(reportComparisonInChIs); + metsInStruct = regexprep(fields(contains(fields, 'met_')), 'met_', ''); + emptyBool = cellfun(@isempty, reportComparisonInChIs.sourcesToSave); + source = reportComparisonInChIs.sourcesToSave(~emptyBool); + + % Obtain the most consistent for molecules with R groups + if exist('groupedSMILES','var') + reportComparisonSMILES = consistentData(model, groupedSMILES, 'SMILES'); + fields = fieldnames(reportComparisonSMILES); + metNamesSmiles = regexprep(fields(contains(fields, 'met_')), 'met_', ''); + metsInStruct(end + 1: end + size(metNamesSmiles, 1)) = metNamesSmiles; + emptyBool = cellfun(@isempty, reportComparisonSMILES.sourcesToSave); + source(end + 1: end + size(metNamesSmiles, 1)) = reportComparisonSMILES.sourcesToSave(~emptyBool); + end + + nRows = size(metsInStruct, 1); + varTypes = {'string', 'string', 'string', 'string', 'string', 'double', 'double', 'string'}; + varNames = {'mets', 'sourceWithHighestScore', 'metNames', 'metFormulas', 'sourceFormula', 'layersInChI', 'socre', 'idUsed'}; + info.comparisonTable = table('Size', [nRows length(varTypes)], 'VariableTypes', varTypes, 'VariableNames', varNames); + + fields = fieldnames(reportComparisonInChIs); + metFields = fields(contains(fields, 'met_')); + for i = 1:size(metFields, 1) + info.comparisonTable.mets(i) = metsInStruct(i); + info.comparisonTable.sourceWithHighestScore(i) = source(i); + if isfield(model, 'metNames') + info.comparisonTable.metNames(i) = reportComparisonInChIs.(metFields{i}).metNames; + end + info.comparisonTable.metFormulas(i) = reportComparisonInChIs.(metFields{i}).metFormula; + selectedIdIdx = find(reportComparisonInChIs.(metFields{i}).idsScore == max(reportComparisonInChIs.(metFields{i}).idsScore)); + info.comparisonTable.sourceFormula(i) = reportComparisonInChIs.(metFields{i}).sourceFormula(selectedIdIdx(1)); + info.comparisonTable.layersInChI(i) = reportComparisonInChIs.(metFields{i}).layersOfDataInChI(selectedIdIdx(1)); + info.comparisonTable.idUsed(i) = reportComparisonInChIs.(metFields{i}).ids(selectedIdIdx(1)); + info.comparisonTable.socre(i) = reportComparisonInChIs.(metFields{i}).idsScore(selectedIdIdx(1)); + end + if exist('groupedSMILES','var') + fields = fieldnames(reportComparisonSMILES); + metFields = fields(contains(fields, 'met_')); + startFrom = i; + for i = 1:size(metFields, 1) + info.comparisonTable.mets(i + startFrom) = metsInStruct(i + startFrom); + info.comparisonTable.sourceWithHighestScore(i + startFrom) = source(i + startFrom); + if isfield(model, 'metNames') + info.comparisonTable.metNames(i + startFrom, 1) = reportComparisonSMILES.(metFields{i}).metNames; + end + info.comparisonTable.metFormulas(i + startFrom) = char(reportComparisonSMILES.(metFields{i}).metFormula); + selectedIdIdx = find(reportComparisonSMILES.(metFields{i}).idsScore == max(reportComparisonSMILES.(metFields{i}).idsScore)); + info.comparisonTable.sourceFormula(i + startFrom) = reportComparisonSMILES.(metFields{i}).sourceFormula(selectedIdIdx(1)); + info.comparisonTable.layersInChI(i + startFrom) = NaN; + info.comparisonTable.idUsed(i + startFrom) = reportComparisonSMILES.(metFields{i}).ids(selectedIdIdx(1)); + info.comparisonTable.socre(i + startFrom) = reportComparisonSMILES.(metFields{i}).idsScore(selectedIdIdx(1)); + end + end + + % Print data + if options.printlevel > 0 + display(info.comparisonTable) + + % heatMap comparison + figure + comparison = reportComparisonInChIs.comparison; + if exist('groupedSMILES','var') + rows = size(comparison, 1); + comparison = [comparison; zeros(size(reportComparisonSMILES.comparison, 1),... + size(comparison, 2))]; + bool = ismember(reportComparisonInChIs.sources, reportComparisonSMILES.sources); + comparison(rows + 1:end, bool) = reportComparisonSMILES.comparison; + end + for i = 1:size(comparison, 2) + for j = 1:size(comparison, 2) + boolToCompare = ~isnan(comparison(:, i)) & ~isnan(comparison(:, j)); + group1 = comparison(boolToCompare, i); + group2 = comparison(boolToCompare, j); + comparisonMatrix(i, j) = sqrt(sum((group1 - group2).^2)); + end + end + h = heatmap(comparisonMatrix); + h.YDisplayLabels = directories; + h.XDisplayLabels = directories; + h.FontSize = 16; + title('Sources disimilarity comparison') + + % Sources comparison + figure + [db, ~, idx] = unique(split(strjoin(source, ' '), ' ')); + [~, ib1] = ismember(db, directories); + yyaxis left + bar(histcounts(idx, size(db, 1))) + title({'Sources comparison', ... + ['Metabolites collected: ' num2str(size(info.comparisonTable, 1))]}, 'FontSize', 20) + ylabel('Times with highest score', 'FontSize', 18) + set(gca, 'XTick', 1:size(db, 1), 'xticklabel', db, 'FontSize', 18) + xtickangle(45) + yyaxis right + bar(info.sourcesCoverage.totalCoverage(ib1), 0.3) + ylabel('IDs coverage', 'FontSize', 20) + + if options.printlevel > 1 + display(groupedInChIs) + end + end + +else + + % Obtain the most consistent molecules + reportComparisonFormula = consistentData(model, groupedFormula, 'MOL'); + fields = fieldnames(reportComparisonFormula); + metsInStruct = regexprep(fields(contains(fields, 'met_')), 'met_', ''); + emptyBool = cellfun(@isempty, reportComparisonFormula.sourcesToSave); + source = reportComparisonFormula.sourcesToSave(~emptyBool); + + nRows = size(metsInStruct, 1); + varTypes = {'string', 'string', 'string', 'string', 'string', 'double', 'double', 'string'}; + varNames = {'mets', 'sourceWithHighestScore', 'metNames', 'metFormulas', 'sourceFormula', 'layersInChI', 'socre', 'idUsed'}; + info.comparisonTable = table('Size', [nRows length(varTypes)], 'VariableTypes', varTypes, 'VariableNames', varNames); + + fields = fieldnames(reportComparisonFormula); + metFields = fields(contains(fields, 'met_')); + for i = 1:size(metFields, 1) + info.comparisonTable.mets(i) = metsInStruct(i); + info.comparisonTable.sourceWithHighestScore(i) = source(i); + if isfield(model, 'metNames') + info.comparisonTable.metNames(i) = reportComparisonFormula.(metFields{i}).metNames; + end + info.comparisonTable.metFormulas(i) = char(reportComparisonFormula.(metFields{i}).metFormula); + selectedIdIdx = find(reportComparisonFormula.(metFields{i}).idsScore == max(reportComparisonFormula.(metFields{i}).idsScore)); + info.comparisonTable.sourceFormula(i) = reportComparisonFormula.(metFields{i}).sourceFormula(selectedIdIdx(1)); + info.comparisonTable.idUsed(i) = reportComparisonFormula.(metFields{i}).ids(selectedIdIdx(1)); + info.comparisonTable.socre(i) = reportComparisonFormula.(metFields{i}).idsScore(selectedIdIdx(1)); + end + + if options.printlevel > 0 + display(info.comparisonTable) + + % heatMap comparison + figure + comparison = reportComparisonFormula.comparison; + for i = 1:size(comparison, 2) + for j = 1:size(comparison, 2) + % Ignore NaN data + boolToCompare = ~isnan(comparison(:, i)) & ~isnan(comparison(:, j)); + group1 = comparison(boolToCompare, i); + group2 = comparison(boolToCompare, j); + comparisonMatrix(i, j) = sqrt(sum((group1 - group2).^2)); + end + end + h = heatmap(comparisonMatrix); + h.YDisplayLabels = directories; + h.XDisplayLabels = directories; + title('Dissimilarity comparison') + + % Sources comparison + figure + [db, ~, idx] = unique(split(strjoin(source, ' '), ' ')); + bar(histcounts(idx, size(db, 1))) + title({'Source of molecules with the highest score', ... + ['Total ' num2str(size(info.comparisonTable, 1))]}, 'FontSize', 20) + ylabel('Times ', 'FontSize', 18) + set(gca, 'XTick', 1:size(db, 1), 'xticklabel', db, 'FontSize', 18) + xtickangle(45) + + if options.printlevel > 1 + display(groupedFormula) + end + end + +end + +% Save the MOL files with highest score +tmpDir = [metDir filesep 'tmp']; +if ~isfolder(tmpDir) + mkdir(tmpDir) +end +source = source(ismember(metsInStruct, unique(regexprep(model.mets, '(\[\w\])', '')))); +metsInStruct = metsInStruct(ismember(metsInStruct, unique(regexprep(model.mets, '(\[\w\])', '')))); +for i = 1:size(metsInStruct, 1) + dirToCopy = strsplit(source{i}, ' '); + if dirsToCompare && ismember(dirToCopy{1}, options.dirNames) + copyfile([options.dirsToCompare{ismember(options.dirNames, dirToCopy{1})} metsInStruct{i} '.mol'], tmpDir) + else + copyfile([comparisonDir dirToCopy{1} filesep metsInStruct{i} '.mol'], tmpDir) + end +end +if ~options.keepMolComparison + rmdir(comparisonDir, 's') +end +if ~options.adjustToModelpH || ~cxcalcInstalled + model.comparison = info.comparisonTable; +end +if options.debug + save([outputDir '3b.debug_afterComparison.mat']) +end + +%% 4. Adjust pH based on the model's chemical formula + +if options.adjustToModelpH && cxcalcInstalled + + info.adjustedpHTable = info.comparisonTable; + + if options.printlevel > 0 + fprintf('%s\n', 'Adjusting pH based on the model''s chemical formula ...') + display(' ') + end + + [needAdjustmentBool, differentFormula, loopError, pHRangePassed] = deal(false(size(info.comparisonTable, 1), 1)); + for i = 1:size(info.comparisonTable, 1) + try + + name = [info.comparisonTable.mets{i} '.mol']; + + if isequal(regexprep(info.comparisonTable.metFormulas(i), 'H\d*', ''),... + regexprep(info.comparisonTable.sourceFormula(i), 'H\d*', '')) % pH different only + + % Get number of hydrogens in the model's metabolite + [elemetList, ~ , elemetEnd] = regexp(info.comparisonTable.metFormulas(i), ['[', 'A':'Z', '][', 'a':'z', ']?'], 'match'); + hBool = contains(elemetList, 'H'); + [num, numStart] = regexp(info.comparisonTable.metFormulas(i), '\d+', 'match'); + numList = ones(size(elemetList)); + numList(ismember(elemetEnd + 1, numStart)) = cellfun(@str2num, num); + noOfH_model = numList(hBool); + + % Get number of hydrogens in the source's metabolite + [elemetList, ~ , elemetEnd] = regexp(info.comparisonTable.sourceFormula(i), ['[', 'A':'Z', '][', 'a':'z', ']?'], 'match'); + hBool = contains(elemetList, 'H'); + [num, numStart] = regexp(info.comparisonTable.sourceFormula(i), '\d+', 'match'); + numList = ones(size(elemetList)); + idx = ismember(elemetEnd + 1, numStart); + numList(idx) = cellfun(@str2num, num); + noOfH_source = numList(hBool); + + % Start with a neutral pH + pH = 7; + while noOfH_model ~= noOfH_source + + if ~needAdjustmentBool(i) + needAdjustmentBool(i) = true; + if noOfH_model - noOfH_source > 0 + pHDifference = -0.33; + else + pHDifference = 0.33; + end + end + + % Change pH + command = ['cxcalc majormicrospecies -H ' num2str(pH) ' -f mol ' tmpDir '' filesep name]; + [~, result] = system(command); + molFile = regexp(result, '\n', 'split')'; + fid2 = fopen([tmpDir filesep 'tmp.mol'], 'w'); + fprintf(fid2, '%s\n', molFile{:}); + fclose(fid2); + % Obtain the chemical formula + command = ['cxcalc elementalanalysistable -t "formula" ' tmpDir filesep 'tmp.mol']; + [~, formula] = system(command); + formula = split(formula); + formula = formula{end - 1}; + % Get number of hydrogens in the adjusted metabolite + [elemetList, ~ , elemetEnd] = regexp(formula, ['[', 'A':'Z', '][', 'a':'z', ']?'], 'match'); + hBool = contains(elemetList, 'H'); + [num, numStart] = regexp(formula, '\d+', 'match'); + numList = ones(size(elemetList)); + idx = ismember(elemetEnd + 1, numStart); + numList(idx) = cellfun(@str2num, num); + noOfH_source = numList(hBool); + + if pH <= 0 || pH >= 14 + pHRangePassed(i) = true; + break + end + pH = pH + pHDifference; + + if noOfH_model == noOfH_source + movefile([tmpDir filesep 'tmp.mol'], [tmpDir filesep name]) + info.adjustedpHTable.sourceFormula(i) = formula; + end + end + else + differentFormula(i) = true; + end + catch + loopError(i) = true; + end + end + + if isfile([tmpDir filesep 'tmp.mol']) + delete([tmpDir filesep 'tmp.mol']) + end + + info.adjustedpHTable.needAdjustmentBool = needAdjustmentBool; + info.adjustedpHTable.notPossible2AdjustBool = differentFormula | loopError | pHRangePassed; + info.adjustedpHTable.differentFormula = differentFormula; + info.adjustedpHTable.loopError = loopError; + info.adjustedpHTable.pHRangePassed = pHRangePassed; + + if options.printlevel > 0 + display('adjustedpH:') + display(info.adjustedpHTable) + end + + model.comparison = info.adjustedpHTable; + +end + +%% 5. Standardise the MOL files according options + +standardisationApproach = options.standardisationApproach; + +% Get list of MOL files +d = dir(tmpDir); +d = d(~[d.isdir]); +metList = {d.name}'; +metList = metList(~cellfun('isempty', regexp(metList,'(\.mol)$'))); +metList = regexprep(metList, '.mol', ''); +metList(~ismember(metList, regexprep(model.mets, '(\[\w\])', ''))) = []; + +% Standardise MOL files the most consitent MOL files +standardisationReport = standardiseMolDatabase(tmpDir, metList, metDir, standardisationApproach); +info.standardisationReport = standardisationReport; + +if oBabelInstalled + % Create table + nRows = size(standardisationReport.SMILES, 1); + varTypes = {'string', 'string', 'string', 'string'}; + varNames = {'mets', 'InChIKeys', 'InChIs', 'SMILES'}; + info.standardisationReport = table('Size', [nRows length(varTypes)], 'VariableTypes', varTypes, 'VariableNames', varNames); + info.standardisationReport.mets(1:end) = standardisationReport.standardised; + info.standardisationReport.InChIKeys(1:size(standardisationReport.InChIKeys, 1)) = standardisationReport.InChIKeys; + info.standardisationReport.InChIs(1:size(standardisationReport.InChIs, 1)) = standardisationReport.InChIs; + info.standardisationReport.SMILES(1:size(standardisationReport.SMILES, 1)) = standardisationReport.SMILES; + % Write table + writetable(info.standardisationReport, [metDir filesep 'standardisationReport']) +end + +if options.printlevel > 0 + display(info.standardisationReport) +end +rmdir(tmpDir, 's') +model.standardisation = info.standardisationReport; + +if options.debug + save([outputDir '5.debug_afterStandardisation.mat']) +end + +%% 6. Atom map data + +% Set options + +% MOL file directory +molFileDir = [metDir filesep 'molFiles']; + +% Create the reaction data directory +if ~isfolder(rxnDir) + mkdir(rxnDir) +end + +% Reactions to atom map +rxnsToAM = model.rxns; + +% Keep standardisation approach used with the molecular structures +switch options.standardisationApproach + case 'explicitH' + hMapping = true; + case 'implicitH' + hMapping = false; + case 'neutral' + case 'basic' + hMapping = true; +end + +% Atom map metabolic reactions +reactionsReport = obtainAtomMappingsRDT(model, molFileDir, rxnDir, rxnsToAM, hMapping, options.onlyUnmapped); +info.reactionsReport = reactionsReport; + + +rxnsFilesDir = [rxnDir filesep 'unMapped']; +if ~options.onlyUnmapped + + % Atom map transport reactions + mappedRxns = transportRxnAM(rxnsFilesDir, [rxnDir filesep 'atomMapped']); + for i = 1:size(mappedRxns, 2) + delete([rxnDir filesep 'images' filesep mappedRxns{i} '.png']); + end + + % Generate rinchis and reaction SMILES + if oBabelInstalled + + nRows = size(rxnsToAM, 1); + varTypes = {'string', 'string', 'string'}; + varNames = {'rxns', 'rinchi', 'rsmi'}; + info.reactionsReport.rxnxIDsTable = table('Size', [nRows length(varTypes)], 'VariableTypes', varTypes, 'VariableNames', varNames); + + model.rinchi = repmat({''}, size(model.rxns)); + model.rsmi = repmat({''}, size(model.rxns)); + for i = 1:size(rxnsToAM, 1) + info.reactionsReport.rxnxIDsTable.rxns(i) = rxnsToAM(i); + if isfile([rxnDir filesep 'atomMapped' filesep rxnsToAM{i} '.rxn']) + % Get rinchis + command = ['obabel -irxn ' rxnDir filesep 'atomMapped' filesep rxnsToAM{i} '.rxn -orinchi']; + [~, result] = system(command); + result = split(result); + info.reactionsReport.rxnxIDsTable.rinchi(i) = result{~cellfun(@isempty, regexp(result, 'RInChI='))}; + model.rinchi{findRxnIDs(model, rxnsToAM{i})} = result{~cellfun(@isempty, regexp(result, 'RInChI='))}; + % Get reaction SMILES + command = ['obabel -irxn ' rxnDir filesep 'atomMapped' filesep rxnsToAM{i} '.rxn -osmi']; + [~, result] = system(command); + result = splitlines(result); + result = split(result{end - 2}); + info.reactionsReport.rxnxIDsTable.rsmi(i) = result{1}; + model.rsmi{findRxnIDs(model, rxnsToAM{i}), 1} = result{1}; + end + end + end +end + +% Find unbalanced RXN files +% Get list of RXN files to check +d = dir(rxnsFilesDir); +d = d(~[d.isdir]); +rxnList = {d.name}'; +rxnList = rxnList(~cellfun('isempty', regexp(rxnList,'(\.rxn)$'))); +rxnList = regexprep(rxnList, '.rxn', ''); +rxnList(~ismember(rxnList, rxnsToAM)) = []; + +[unbalancedBool, v3000] = deal(false(size(rxnList))); +for i = 1:size(rxnList, 1) + + name = [rxnList{i} '.rxn']; + % Read the RXN file + rxnFile = regexp(fileread([rxnsFilesDir filesep name]), '\n', 'split')'; + + % Identify molecules + substrates = str2double(rxnFile{5}(1:3)); + products = str2double(rxnFile{5}(4:6)); + begMol = strmatch('$MOL', rxnFile); + + if ~isnan(products) + % Count atoms in substrates and products + atomsS = 0; + for j = 1:substrates + atomsS = atomsS + str2double(rxnFile{begMol(j) + 4}(1:3)); + end + atomsP = 0; + for j = substrates + 1: substrates +products + atomsP = atomsP + str2double(rxnFile{begMol(j) + 4}(1:3)); + end + + % Check if the file is unbalanced + if atomsS ~= atomsP + unbalancedBool(i) = true; + end + else + v3000(i) = true; + end +end + +info.reactionsReport.balancedReactions = rxnList(~unbalancedBool); +info.reactionsReport.unbalancedReactions = rxnList(unbalancedBool); +model = findSExRxnInd(model); +info.reactionsReport.rxnMissing = setdiff(model.rxns(model.SIntRxnBool), info.reactionsReport.rxnFilesWritten); + +metsInBalanced = unique(regexprep(findMetsFromRxns(model, rxnList(~unbalancedBool)), '(\[\w\])', '')); +metsInUnbalanced = unique(regexprep(findMetsFromRxns(model, rxnList(unbalancedBool)), '(\[\w\])', '')); +info.reactionsReport.metsAllwaysInBalancedRxns = umets(ismember(umets, setdiff(metsInBalanced, metsInUnbalanced))); +info.reactionsReport.metsSometimesInUnbalancedRxns = umets(ismember(umets, intersect(metsInBalanced, metsInUnbalanced))); +info.reactionsReport.metsAllwaysInUnbalancedRxns = umets(ismember(umets, setdiff(metsInUnbalanced, metsInBalanced))); +info.reactionsReport.missingMets = setdiff(umets, [metsInBalanced; metsInUnbalanced]); + +info.reactionsReport.table = table([... + size(info.reactionsReport.rxnFilesWritten, 1);... + size(info.reactionsReport.mappedRxns, 1);... + size(info.reactionsReport.balancedReactions, 1);... + size(info.reactionsReport.unbalancedReactions, 1);... + size(info.reactionsReport.rxnMissing, 1);... + size(info.reactionsReport.metsAllwaysInBalancedRxns, 1) + ... + size(info.reactionsReport.metsSometimesInUnbalancedRxns, 1) + ... + size(info.reactionsReport.metsAllwaysInUnbalancedRxns, 1);... + size(info.reactionsReport.metsAllwaysInBalancedRxns, 1) + ... + size(info.reactionsReport.metsSometimesInUnbalancedRxns, 1);... + size(info.reactionsReport.metsAllwaysInUnbalancedRxns, 1);... + size(info.reactionsReport.missingMets, 1)],... + ... + 'VariableNames', ... + {'Var'},... + 'RowNames',... + {'RXN files written'; ... + 'Atom mapped reactions';... + 'Balanced reactions';... + 'Unbalanced reactions';... + 'Missing reactions';... + 'Metabolites obtained'; ... + 'Metabolites in balanced rxns'; ... + 'Metabolites allways in unbalanced rxns'; ... + 'Missing metabolites'}); + +if options.printlevel > 0 + + if ~options.onlyUnmapped + display(info.reactionsReport.rxnxIDsTable) + end + + display(info.reactionsReport.table) + + % Reactions + figure + labelsToAdd = {'Balanced', 'Unbalanced', 'Missing'}; + X = [size(info.reactionsReport.balancedReactions, 1);... + size(info.reactionsReport.unbalancedReactions, 1);... + size(info.reactionsReport.rxnMissing, 1)]; + pieChart = pie(X(find(X))); + title({'RXN coverage', ['From ' num2str(sum(X)) ' internal rxns in the model']}, 'FontSize', 20) + legend(labelsToAdd(find(X)), 'FontSize', 16) + set(findobj(pieChart,'type','text'),'fontsize',18) + + % Metabolites + figure + labelsToAdd = {'In balanced rxn', 'Ocassionally in unbalanced rxn', 'In unbalanced rxn', 'Missing'}; + X = [size(info.reactionsReport.metsAllwaysInBalancedRxns, 1);... + size(info.reactionsReport.metsSometimesInUnbalancedRxns, 1);... + size(info.reactionsReport.metsAllwaysInUnbalancedRxns, 1);... + size(info.reactionsReport.missingMets, 1)]; + pieChart = pie(X(find(X))); + title({'Met percentage coverage', ['From ' num2str(size(umets, 1)) ' unique mets in the model']}, 'FontSize', 20) + legend(labelsToAdd(find(X)), 'FontSize', 16) + set(findobj(pieChart,'type','text'),'fontsize',18) + +end +if options.printlevel > 1 + disp('RXN files written') + display(info.reactionsReport.rxnFilesWritten) + disp('Atom mapped reactions') + display(info.reactionsReport.mappedRxns) + disp('Balanced reactions') + display(info.reactionsReport.balancedReactions) + disp('Unbalanced reactions') + display(info.reactionsReport.unbalancedReactions) + disp('Metabolites allways in balanced rxns') + display(info.reactionsReport.metsAllwaysInBalancedRxns) + disp('Metabolites ocasional in unbalanced rxns') + display(info.reactionsReport.metsSometimesInUnbalancedRxns) + disp('Metabolites allways in unbalanced rxns') + display(info.reactionsReport.metsAllwaysInUnbalancedRxns) + disp('Missing metabolites') + display(info.reactionsReport.missingMets) +end + +if options.debug + save([outputDir '6.debug_endOfreactionDatabase.mat']) +end + +%% 7. Bond enthalpies and bonds broken and formed + +if ~options.onlyUnmapped + + % Get bond enthalpies and bonds broken and formed + if options.printlevel > 0 + display('Obtaining RInChIes and reaction SMILES ...') + [bondsBF, bondsE, meanBBF, meanBE] = findBEandBBF(model, [rxnDir filesep 'atomMapped'], 1); + info.bondsData.bondsDataTable = table(model.rxns, model.rxnNames, bondsBF, bondsE, ... + 'VariableNames', {'rxns', 'rxnNames', 'bondsBF', 'bondsE'}); + info.bondsData.meanBBF = meanBBF; + info.bondsData.meanBE = meanBE; + display(info.bondsData.bondsDataTable) + else + [bondsBF, bondsE, meanBBF, meanBE] = findBEandBBF(model, [rxnDir filesep 'atomMapped']); + info.bondsData.table = table(model.rxns, model.rxnNames, bondsBF, bondsE, ... + 'VariableNames', {'rxns','rxnNames','bondsBF','bondsE'}); + info.bondsData.table = sortrows(info.bondsData.table, {'bondsBF'}, {'descend'}); + info.bondsData.meanBBF = meanBBF; + info.bondsData.meanBE = meanBE; + end + + % Add data in the model + model.bondsBF = bondsBF; + model.bondsE = bondsE; + model.meanBBF = meanBBF; + model.meanBE = meanBE; + +end + +newModel = model; +if options.debug + save([outputDir '7.debug_endOfGenerateChemicalDatabase.mat']) +end + +diary off +if options.printlevel > 0 > 0 + fprintf('%s\n', ['Diary written to: ' options.outputDir]) + fprintf('%s\n', 'generateChemicalDatabase run is complete.') +end + +end + +function reportComparison = consistentData(model, groupedIDs, typeID) +% The most cross-validated ID is saved. It's compared molecular formula of +% the model, the charge and the similarity with other IDs + +% Compare and sort results based on prime numbers. The smallest prime +% number represent the most similar inchis between databases +IDsArray = table2cell(groupedIDs); + +% Start report +reportComparison.groupedID = groupedIDs; +reportComparison.sources = groupedIDs.Properties.VariableNames(2:end)'; +reportComparison.sourcesToSave = cell(size(IDsArray, 1), 1); + +% Delete the Names +metNames = IDsArray(:, 1); +IDsArray(:, 1) = []; +for i = 1:size(IDsArray, 1) + + % 1st comparison - Similarity + emptyBool = cellfun(@isempty, IDsArray(i, 1:end)); + IDsArray(i, emptyBool) = {'noData'}; + [~, ia, ic] = unique(IDsArray(i, 1:end)); + ic(emptyBool) = NaN; + ia(contains(IDsArray(i, ia), 'noData')) = []; + if ~isempty(ia) + ia = sort(ia); + % Acsending values + c = 0; + icAcsending = ic; + for j = 1:size(ia, 1) + if ~isnan(ic(ia(j))) + c = c + 1; + icAcsending(ic == ic(ia(j))) = c; + end + end + comparison(i, :) = icAcsending'; + + % 2nd comparison - cross-validation & chemical formulas comparison & + % InChI data (if is inchi) + + % Cross-validation (assign score) + idsScore = zeros(size(ia, 1), 1); + for j = 1:size(ia, 1) + idsScore(j) = idsScore(j) + (sum(ic == ic(ia(j))) / size(ic, 1)); + end + + % Chemical formula comparison + % Get model formula + metIdx = find(ismember(regexprep(model.mets, '(\[\w\])', ''), groupedIDs.metNames{i})); + rGroup = ["X", "Y", "*", "FULLR"]; + if contains(model.metFormulas(metIdx(1)), rGroup) + modelFormula = editChemicalFormula(model.metFormulas{metIdx(1)}); + else + modelFormula = model.metFormulas{metIdx(1)}; + end + if isfield(model, 'metNames') + reportComparison.(['met_' groupedIDs.metNames{i}]).metNames = model.metNames(metIdx(1)); + end + reportComparison.(['met_' groupedIDs.metNames{i}]).metFormula = {modelFormula}; + % Get ID formula + reportComparison.(['met_' groupedIDs.metNames{i}]).uniqueIdIdx = ia; + for j = 1:size(ia, 1) + consistentID = IDsArray{i, ia(j)}; + + switch typeID + case 'InChI' + [elemetList, ~ , ~] = regexp(consistentID, '/([^/]*)/', 'match'); + if isempty(elemetList) + IDformula = ''; + else + IDformula = regexprep(elemetList{1}, '/', ''); + end + + case 'SMILES' + + % Get formula from MOL + currentDir = pwd; + fid2 = fopen([currentDir filesep 'tmp'], 'w'); + fprintf(fid2, '%s\n', consistentID); + fclose(fid2); + command = 'obabel -ismi tmp -O tmp.mol mol -h'; + [~, ~] = system(command); + molFile = regexp(fileread([currentDir filesep 'tmp.mol']), '\n', 'split')'; + if ~isempty(char(molFile)) + atomsString = []; + for k = 1:str2num(molFile{4}(1:3)) + atomsString = [atomsString strtrim(molFile{4 + k}(32:33))]; + end + IDformula = editChemicalFormula(atomsString); + end + delete([currentDir filesep 'tmp.mol']) + + case 'MOL' + IDformula = consistentID; + end + reportComparison.(['met_' groupedIDs.metNames{i}]).sourceFormula{j, 1} = IDformula; + + % Compare (assign score) + if isequal(modelFormula, IDformula) + idsScore(j) = idsScore(j) + 10; % ten times the scale in cross-validation + elseif isequal(regexprep(modelFormula, 'H\d*', ''), regexprep(IDformula, 'H\d*', '')) % pH different + idsScore(j) = idsScore(j) + 8; + end + + % InChI layers comparison + switch typeID + case 'InChI' + inchiLayersDetail = getInchiData(IDsArray{i, ia(j)}); + layersOfDataInChI(j, 1) = inchiLayersDetail.layers; + idsScore(j) = idsScore(j) + 4 + layersOfDataInChI(j, 1); % scale in formula comparison + reportComparison.(['met_' groupedIDs.metNames{i}]).layersOfDataInChI(j, 1) = ... + layersOfDataInChI(j, 1); + + case 'SMILES' + + fid2 = fopen([currentDir filesep 'tmp'], 'w'); + fprintf(fid2, '%s\n', IDsArray{i, ia(j)}); + fclose(fid2); + command = 'obabel -ismi tmp -O tmp.mol mol -h'; + [~, ~] = system(command); + molFile = regexp(fileread([currentDir filesep 'tmp.mol']), '\n', 'split')'; + molFile = regexprep(molFile, 'X|Y|*|R|A', 'H'); + fid2 = fopen([currentDir filesep 'tmp.mol'], 'w'); + fprintf(fid2, '%s\n', molFile{:}); + fclose(fid2); + % Get inchis of the original metabolites + command = ['obabel -imol ' currentDir filesep 'tmp.mol -oinchi']; + [~, result] = system(command); + result = split(result); + inchiLayersDetail = getInchiData(result{contains(result, 'InChI=1S')}); + layersOfDataInChI(j, 1) = inchiLayersDetail.layers; + idsScore(j) = idsScore(j) + 4 + layersOfDataInChI(j, 1); % scale in formula comparison + reportComparison.(['met_' groupedIDs.metNames{i}]).layersOfDataInChI(j, 1) = ... + layersOfDataInChI(j, 1); + delete([currentDir filesep 'tmp']) + delete([currentDir filesep 'tmp.mol']) + + end + end + + % Continue report + reportComparison.(['met_' groupedIDs.metNames{i}]).ids(:, 1) = IDsArray(i, ia); + reportComparison.(['met_' groupedIDs.metNames{i}]).idsScore = idsScore; + toSaveidx = find(ismember(ic, ic(ia(idsScore == max(idsScore))))); + reportComparison.sourcesToSave{i, :} = strjoin(reportComparison.sources(toSaveidx), ' '); + else + reportComparison.sourcesToSave{i, :} = ''; + end +end +reportComparison.comparison = comparison; +end diff --git a/src/dataIntegration/chemoInformatics/inchi/getInchiData.m b/src/dataIntegration/chemoInformatics/inchi/getInchiData.m new file mode 100644 index 0000000000..14d04cdc36 --- /dev/null +++ b/src/dataIntegration/chemoInformatics/inchi/getInchiData.m @@ -0,0 +1,70 @@ +function inchiLayersDetail = getInchiData(inchi) +% Classify the inchi according to its various layers of information +% +% USAGE: +% +% detailLevelInchi = inchiDetail(inchi) +% +% INPUTS: +% inchi: String with the InChI to classify +% +% OUTPUTS: +% detailLevelInchi: Struct file with the following fields: +% +% * .layers - Number of layers in the InChI. +% * .standart - Logical, indicates whether the inchi +% is standard or not. +% * .metFormulas - The molecula formula. +% * .positiveCharges - Number of positive charges. +% * .negativeCharges - Number of negative charges. +% * .netCharge - Summ of the charges. +% * .stereochemicalLayer - Logical, indicates whether +% the inchi represent +% stereochemical information +% or not. +% * .isotopicLayer - Logical, indicates whether the +% inchi represent isotopic +% information or not. + +inchiSplited = split(inchi, '/'); + +% Check inchi layers +inchiLayersDetail.layers = numel(inchiSplited); + +% Check if it is a standard inchi +assert(contains(inchiSplited{1}, 'InChI='), [inchi ' is not an InChI']) +if contains(inchiSplited{1}, '1S') + inchiLayersDetail.standart = true; +else + inchiLayersDetail.standart = false; +end + +% Chemical formula +inchiLayersDetail.metFormulas = inchiSplited{2}; + +% Charge layer +pLayer = contains(inchiSplited, 'p'); +if any(pLayer) + inchiLayersDetail.positiveCharges = str2double(regexprep(inchiSplited{pLayer}, 'p-|;', '')); +else + inchiLayersDetail.positiveCharges = 0; +end +qLayer = contains(inchiSplited, 'q'); +if any(qLayer) + inchiLayersDetail.negativeCharges = str2double(regexprep(inchiSplited{qLayer}, 'q\+|;', '')); +else + inchiLayersDetail.negativeCharges = 0; +end +inchiLayersDetail.netCharge = inchiLayersDetail.positiveCharges - inchiLayersDetail.negativeCharges; + +% Stereochemical layer +if any(~cellfun(@isempty, regexp(inchiSplited, 'b|t|m|s'))) + inchiLayersDetail.stereochemicalLayer = true; +end + +% Isotopic layer +if any(~cellfun(@isempty, regexp(inchiSplited, 'i|h'))) + inchiLayersDetail.isotopicLayer = true; +end + +end \ No newline at end of file diff --git a/src/dataIntegration/chemoInformatics/obtainAtomMappingsRDT.m b/src/dataIntegration/chemoInformatics/obtainAtomMappingsRDT.m index 5e8ea34b1a..65f96b43bb 100644 --- a/src/dataIntegration/chemoInformatics/obtainAtomMappingsRDT.m +++ b/src/dataIntegration/chemoInformatics/obtainAtomMappingsRDT.m @@ -1,10 +1,13 @@ -function standardisedRxns = obtainAtomMappingsRDT(model, molFileDir, outputDir, rxnsToAM, hMapping, maxTime, standariseRxn) -% Compute atom mappings for reactions with implicit hydrogens in a -% metabolic network using RDT algorithm +function atomMappingReport = obtainAtomMappingsRDT(model, molFileDir, rxnDir, rxnsToAM, hMapping, onlyUnmapped) +% Using the reaction decoder tool, compute atom mappings for reactions in a +% COBRA model. Atom mapping data is presented in a variety of formats, +% including MDL RXN, SMILES, and images. If this option is selected, the +% function can remove all protons from the model and represent the +% reactions as a hydrogen suppressed chemical graph. % % USAGE: % -% standardisedRxns = obtainAtomMappingsRDT(model, molFileDir, outputDir, rxnsToAM, hMapping, maxTime, standariseRxn) +% standardisedRxns = obtainAtomMappingsRDT(model, molFileDir, rxnDir, rxnsToAM, hMapping, maxTime, standariseRxn) % % INPUTS: % model: COBRA model with following fields: @@ -24,16 +27,14 @@ % reaction identifiers in input mets. % % OPTIONAL INPUTS: -% rxnsToAM: List of reactions to atom map (default: all in the +% rxnsToAM: List of reactions to atom map (default: all in the % model). % hMapping: Logic value to select if hydrogen atoms will be atom % mapped (default: TRUE). % rxnDir: Path to directory that will contain the RXN files with % atom mappings (default: current directory). -% maxTime: Maximum time assigned to compute atom mapping (default -% 1800s). -% standariseRxn: Logic value for standardising the atom mapped RXN file. -% ChemAxon license is required (default: TRUE). +% onlyUnmapped: Logic value to select create only unmapped MDL RXN +% files (default: FALSE). % % OUTPUTS: % balancedRxns: List of standadised atom mapped reactions. @@ -56,11 +57,11 @@ % % .. Author: - German A. Preciat Gonzalez 25/05/2017 -if nargin < 3 || isempty(outputDir) - outputDir = [pwd filesep]; +if nargin < 3 || isempty(rxnDir) + rxnDir = [pwd filesep]; else % Make sure input path ends with directory separator - outputDir = [regexprep(outputDir,'(/|\\)$',''), filesep]; + rxnDir = [regexprep(rxnDir,'(/|\\)$',''), filesep]; end if nargin < 4 || isempty(rxnsToAM) rxnsToAM = model.rxns; @@ -68,36 +69,37 @@ if nargin < 5 || isempty(hMapping) hMapping = true; end -if nargin < 6 || isempty(maxTime) - maxTime = 1800; -end -if nargin < 7 || isempty(standariseRxn) - standariseRxn = true; -end -if exist('rtdDir','var') - rtdDir = [pwd filesep]; -else - % Make sure input path ends with directory separator - rtdDir = [regexprep(rdtDir,'(/|\\)$',''), filesep]; +if nargin < 6 || isempty(onlyUnmapped) + onlyUnmapped = false; end + +maxTime = 1800; + +% Check if JAVA is installed +[javaInstalled, ~] = system('java'); + % Generating new directories -if ~exist([outputDir filesep 'rxnFiles'],'dir') - mkdir([outputDir filesep 'rxnFiles']) +if ~exist([rxnDir filesep 'unMapped'],'dir') + mkdir([rxnDir filesep 'unMapped']) end -if ~exist([outputDir filesep 'atomMapped'],'dir') - mkdir([outputDir filesep 'atomMapped']) -end -if ~exist([outputDir filesep 'images'],'dir') - mkdir([outputDir filesep 'images']) -end -if ~exist([outputDir filesep 'txtData'],'dir') - mkdir([outputDir filesep 'txtData']) +if javaInstalled && ~onlyUnmapped + if ~exist([rxnDir filesep 'atomMapped'],'dir') + mkdir([rxnDir filesep 'atomMapped']) + end + if ~exist([rxnDir filesep 'images'],'dir') + mkdir([rxnDir filesep 'images']) + end + if ~exist([rxnDir filesep 'txtData'],'dir') + mkdir([rxnDir filesep 'txtData']) + end end % Download the RDT algorithm, if it is not present in the output directory -if exist([rdtDir filesep 'rdtAlgorithm.jar'])~=2 - urlwrite('https://github.com/asad/ReactionDecoder/releases/download/v2.1.0/rdt-2.1.0-SNAPSHOT-jar-with-dependencies.jar',[rdtDir filesep 'rdtAlgorithm.jar']); - %urlwrite('https://github.com/asad/ReactionDecoder/releases/download/1.5.1/rdt-1.5.1-SNAPSHOT-jar-with-dependencies.jar',[outputDir filesep 'rdtAlgorithm.jar']); +if exist([rxnDir filesep 'rdtAlgorithm.jar']) ~= 2 && javaInstalled && ~onlyUnmapped + urlwrite('https://github.com/asad/ReactionDecoder/releases/download/v2.4.1/rdt-2.4.1-jar-with-dependencies.jar',[rxnDir filesep 'rdtAlgorithm.jar']); + % Previous releases: + % urlwrite('https://github.com/asad/ReactionDecoder/releases/download/v2.1.0/rdt-2.1.0-SNAPSHOT-jar-with-dependencies.jar',[outputDir filesep 'rdtAlgorithm.jar']); + % urlwrite('https://github.com/asad/ReactionDecoder/releases/download/1.5.1/rdt-1.5.1-SNAPSHOT-jar-with-dependencies.jar',[outputDir filesep 'rdtAlgorithm.jar']); end % Delete the protons (hydrogens) for the metabolic network @@ -114,20 +116,20 @@ model.S = S; end - % Format inputs mets = model.mets; fmets = regexprep(mets, '(\[\w\])', ''); rxns = rxnsToAM; +rxnsInModel = model.rxns; clear model % Get list of MOL files d = dir(molFileDir); d = d(~[d.isdir]); aMets = {d.name}'; -aMets = aMets(~cellfun('isempty',regexp(aMets,'(\.mol)$'))); +aMets = aMets(~cellfun('isempty', regexp(aMets,'(\.mol)$'))); % Identifiers for atom mapped reactions -aMets = regexprep(aMets, '(\.mol)$',''); +aMets = regexprep(aMets, '(\.mol)$', ''); assert(~isempty(aMets), 'MOL files directory is empty or nonexistent.'); % Extract MOL files @@ -140,72 +142,203 @@ % Create the RXN files. Three conditions are required: 1) To have all the % MOL files in the reaction, 2) No exchange reactions, 3) Only integers in % the stoichiometry -for i=1:length(rxns) - a = ismember(regexprep(mets(find(S(:,i))), '(\[\w\])', ''), aMets); - s = S(find(S(:, i)), i); - if all(a(:) > 0) && length(a) ~= 1 && all(abs(round(s) - s) < (1e-2)) - writeRxnfile(S(:, i), mets, fmets, molFileDir, rxns{i}, [outputDir... - filesep 'rxnFiles' filesep]) +rxnFilesWrittenBool = false(length(rxns), 1); +for i = 1:length(rxns) + rxnBool = ismember(rxnsInModel, rxns{i}); + metsInRxns = ismember(fmets(find(S(:, rxnBool))), aMets); + stoichiometry = S(find(S(:, rxnBool)), rxnBool); + if ~any(~metsInRxns) && length(metsInRxns) > 1 && all(abs(round(stoichiometry) - stoichiometry) < (1e-2)) + writeRxnfile(S(:, rxnBool), mets, fmets, molFileDir, rxns{i}, [rxnDir... + filesep 'unMapped' filesep]) + rxnFilesWrittenBool(i) = true; end end +atomMappingReport.rxnFilesWritten = rxns(rxnFilesWrittenBool); + +% Get list of new RXN files +d = dir([rxnDir filesep 'unMapped' filesep]); +d = d(~[d.isdir]); +aRxns = regexprep({d.name}', '.rxn', ''); +assert(~isempty(aRxns), 'No rxn file was written.'); +rxnsToAM = rxnsToAM(ismember(rxnsToAM, aRxns)); + +mappedBool = false(length(rxnsToAM), 1); % Atom map RXN files -fnames = dir([outputDir filesep 'rxnFiles' filesep '*.rxn']); -fprintf('Computing atom mappings for %d reactions.\n\n', length(fnames)); - -% Start from the lighter RXN to the heavier -[~,bytes] = sort([fnames.bytes]); -counterBalanced = 0; -counterNotMapped = 0; -counterUnbalanced = 0; -for i=1:length(fnames) - name = [outputDir 'rxnFiles' filesep fnames(bytes(i)).name]; - command = ['timeout ' num2str(maxTime) 's java -jar ' rdtDir filesep 'rdtAlgorithm.jar -Q RXN -q "' name '" -g -j AAM -f TEXT']; - if ismac - command = ['g' command]; - end - [status, result] = system(command); - if status ~= 0 - fprintf(result); - error('Command %s could not be run.\n', command); +if javaInstalled == 1 && ~onlyUnmapped + fprintf('Computing atom mappings for %d reactions.\n\n', length(rxnsToAM)); + for i = 1:length(rxnsToAM) + + name = [rxnDir 'unMapped' filesep rxnsToAM{i} '.rxn']; + command = ['timeout ' num2str(maxTime) 's java -jar ' rxnDir 'rdtAlgorithm.jar -Q RXN -q "' name '" -g -j AAM -f TEXT']; + + if ismac + command = ['g' command]; + end + [status, result] = system(command); + if contains(result, 'file not found!') + warning(['The file ' name ' was not found']) + end + if ~status + fprintf(result); + error('Command %s could not be run.\n', command); + end + mNames = dir('ECBLAST_*'); + if length(mNames) == 3 + name = regexprep({mNames.name}, 'ECBLAST_|_AAM', ''); + cellfun(@movefile, {mNames.name}, name) + cellfun(@movefile, name, {[rxnDir 'images'], [rxnDir... + 'atomMapped'], [rxnDir 'txtData']}) + mappedBool(i) = true; + elseif ~isempty(mNames) + delete(mNames.name) + end + end - mNames = dir('ECBLAST_*'); - if length(mNames) == 3 - name = regexprep({mNames.name}, 'ECBLAST_|_AAM', ''); - cellfun(@movefile, {mNames.name}, name) - cellfun(@movefile, name, {[outputDir 'images'], [outputDir... - 'atomMapped'], [outputDir 'txtData']}) - elseif ~isempty(mNames) - delete(mNames.name) - counterNotMapped = counterNotMapped + 1; - else - counterNotMapped = counterNotMapped + 1; - end -end - -% Standarize reactions -if standariseRxn == true - fnames = dir([outputDir filesep 'atomMapped' filesep '*.rxn']); - for i = 1:length(fnames) - standardised = canonicalRxn(fnames(i).name, [outputDir... - 'atomMapped'], [outputDir 'rxnFiles']); - if standardised - counterBalanced = counterBalanced + 1; - standardisedRxns{counterBalanced} = regexprep(fnames(i).name, '.rxn', ''); - else - counterUnbalanced = counterUnbalanced + 1; + mappedRxns = rxnsToAM(mappedBool); + atomMappingReport.mappedRxns = mappedRxns; + + delete([rxnDir 'rdtAlgorithm.jar']) + + for i = 1:length(mappedRxns) + + name = [mappedRxns{i} '.rxn']; + + % Add header + mappedFile = regexp(fileread([rxnDir 'atomMapped' filesep name]), '\n', 'split')'; + standardFile = regexp(fileread([rxnDir 'unMapped' filesep name]), '\n', 'split')'; + mappedFile{2} = standardFile{2}; + mappedFile{3} = ['COBRA Toolbox v3.0 - Atom mapped - ' datestr(datetime)]; + mappedFile{4} = standardFile{4}; + + formula = strsplit(mappedFile{4}, {'->', '<=>'}); + + substratesFormula = strtrim(strsplit(formula{1}, '+')); + % Check if a metabolite is repeated in the substrates formula + repMetsSubInx = find(~cellfun(@isempty, regexp(substratesFormula, ' '))); + if ~isempty(repMetsSubInx) + for j = 1:length(repMetsSubInx) + metRep = strsplit(substratesFormula{repMetsSubInx(j)}); + timesRep = str2double(metRep{1}); + metRep = metRep{2}; + substratesFormula{repMetsSubInx(j)} = strjoin(repmat({metRep}, [1 timesRep])); + end + substratesFormula = strsplit(strjoin(substratesFormula)); + end + + productsFormula = strtrim(strsplit(formula{2}, '+')); + % Check if a metabolite is repeated in the products formula + repMetsProInx = find(~cellfun(@isempty, regexp(productsFormula, ' '))); + if ~isempty(repMetsProInx) + for j = 1:length(repMetsProInx) + metRep = strsplit(productsFormula{repMetsProInx(j)}); + timesRep = str2double(metRep{1}); + metRep = metRep{2}; + productsFormula{repMetsProInx(j)} = strjoin(repmat({metRep}, [1 timesRep])); + end + productsFormula = strsplit(strjoin(productsFormula)); end + + % RXN file data + begmol = strmatch('$MOL', mappedFile); + noOfMolSubstrates = str2double(mappedFile{5}(1:3)); + substratesMol = mappedFile(begmol(1:noOfMolSubstrates) + 1)'; + noOfMolProducts = str2double(mappedFile{5}(4:6)); + productsMol = mappedFile(begmol(noOfMolSubstrates + 1:noOfMolSubstrates + noOfMolProducts) + 1)'; + + % Formula data + noOfsubstrates = numel(substratesFormula); + noOfproducts = numel(productsFormula); + + % Check if the stoichemestry is correct + if ~isequal(noOfsubstrates, substratesMol) || ~isequal(noOfproducts, productsMol) + mappedFile = sortMets(mappedFile, substratesMol, substratesFormula, productsMol, productsFormula, rxnDir); + end + % Rewrite the file + fid2 = fopen([rxnDir 'atomMapped' filesep name], 'w'); + fprintf(fid2, '%s\n', mappedFile{:}); + fclose(fid2); end else - standardisedRxns = []; - counterUnbalanced = length(dir([outputDir 'atomMapped' filesep '*.rxn'])); + atomMappingReport.mappedRxns = []; end -fprintf('\n%d reactions were atom mapped\n', length(dir([outputDir 'atomMapped' filesep '*.rxn']))); -fprintf('%d reactions are not standardised\n', counterUnbalanced); -fprintf('%d reactions were not mapped\n\n\n', counterNotMapped); +% fprintf('\n%d reactions were atom mapped\n', length(dir([outputDir 'atomMapped' filesep '*.rxn']))); +% fprintf('%d reactions are not standardised\n', counterUnbalanced); +% fprintf('%d reactions were not mapped\n\n\n', counterNotMapped); +% +% fprintf('RDT algorithm was developed by:\n'); +% fprintf('SA Rahman et al.: Reaction Decoder Tool (RDT): Extracting Features from Chemical\n'); +% fprintf('Reactions, Bioinformatics (2016), doi: 10.1093/bioinformatics/btw096\n'); +end + + +function newFile = sortMets(mappedFile, substratesMol, substratesFormula, productsMol, productsFormula, outputDir) +% Function to sort the metabolites as in the model's stoichiometry -fprintf('RDT algorithm was developed by:\n'); -fprintf('SA Rahman et al.: Reaction Decoder Tool (RDT): Extracting Features from Chemical\n'); -fprintf('Reactions, Bioinformatics (2016), doi: 10.1093/bioinformatics/btw096\n'); +begmol = strmatch('$MOL', mappedFile); + +% Check if bondless atoms were divided +if numel(substratesFormula) ~= numel(substratesMol) || numel(productsFormula) ~= numel(productsMol) + + if ~exist([outputDir filesep 'atomMapped' filesep 'toFix'],'dir') + mkdir([outputDir filesep 'atomMapped' filesep 'toFix']) + end + copyfile([outputDir filesep 'atomMapped' filesep mappedFile{2} '.rxn'], [outputDir filesep 'atomMapped' filesep 'toFix']) + newFile(1:5, 1) = mappedFile(1:5); +else + + newFile(1:5, 1) = mappedFile(1:5); + + %%% Sort substrates + [~,idm] = sort(substratesMol); + [~,ids] = sort(substratesFormula); + [~,ids] = sort(ids); + indexes = idm(ids); + + % Save each metabolite + for k = 1:numel(substratesFormula) + lineInMol = 1; + eval(sprintf('molS%d{%d} = mappedFile{begmol(%d)};', k, lineInMol, k)) + while ~isequal(mappedFile{begmol(k) + lineInMol}, 'M END') + eval(sprintf('molS%d{%d + 1} = mappedFile{begmol(%d) + %d};', k, lineInMol, k, lineInMol)) + lineInMol = lineInMol + 1; + end + eval(sprintf('molS%d{%d + 1} = ''M END'';', k, lineInMol)) + end + % From the start of the mol files + c = 5; + for k = 1:numel(substratesFormula) + eval(sprintf('noOfLines = numel(molS%d);', indexes(k))) + for j = 1:noOfLines + c = c + 1; + eval(sprintf('newFile{%d} = molS%d{%d};', c, indexes(k), j)) + end + end + + %%% Sort products + [~,idmp] = sort(productsMol); + [~,idp] = sort(productsFormula); + [~,idp] = sort(idp); + indexes = idmp(idp); + + for i = numel(substratesFormula) + 1:numel(substratesFormula) + numel(productsFormula) + lineInMol=1; + eval(sprintf('molP%d{%d} = mappedFile{begmol(%d)};', i - numel(substratesFormula), lineInMol, i)) + while ~isequal(mappedFile{begmol(i) + lineInMol}, 'M END') + eval(sprintf('molP%d{%d + 1} = mappedFile{begmol(%d) + %d};', i - numel(substratesFormula), lineInMol, i, lineInMol)) + lineInMol = lineInMol + 1; + end + eval(sprintf('molP%d{%d + 1} = ''M END'';', i - numel(substratesFormula), lineInMol)) + end + for i = numel(substratesFormula) + 1:numel(substratesFormula) + numel(productsFormula) + molName = regexprep(productsFormula{i - numel(substratesFormula)}, '\[|\]', '_'); + eval(sprintf('noOfLines = numel(molP%d);', indexes(i - numel(substratesFormula)))) + for j = 1:noOfLines + c = c + 1; + eval(sprintf('newFile{%d} = molP%d{%d};', c, indexes(i - numel(substratesFormula)), j)) + end + end +end +end \ No newline at end of file diff --git a/src/dataIntegration/chemoInformatics/obtainMetStructures.m b/src/dataIntegration/chemoInformatics/obtainMetStructures.m new file mode 100644 index 0000000000..285317ba26 --- /dev/null +++ b/src/dataIntegration/chemoInformatics/obtainMetStructures.m @@ -0,0 +1,379 @@ +function molCollectionReport = obtainMetStructures(model, outputDir, updateDB, standardisationApproach, orderOfPreference) +% Obtain MDL MOL files from various databases, including KEGG, HMDB, ChEBI, +% and PubChem. Alternatively, openBabel can be used to convert InChI +% strings or SMILES in MDL MOL files. +% +% USAGE: +% +% missingMolFiles = obtainMetStructures(model, outputDir, updateDB, standardisationApproach, orderOfPreference) +% +% INPUTS: +% model: COBRA model with following fields: +% +% * .S - The m x n stoichiometric matrix for the +% metabolic network. +% * .mets - An m x 1 array of metabolite identifiers. +% * .metInChIString - An m x 1 array of metabolite identifiers. +% * .metSmiles - An m x 1 array of metabolite identifiers. +% * .metVMHID - An m x 1 array of metabolite identifiers. +% * .metCHEBIID - An m x 1 array of metabolite identifiers. +% * .metKEGGID - An m x 1 array of metabolite identifiers. +% * .metPubChemID - An m x 1 array of metabolite identifiers. +% * .metHMDBID - An m x 1 array of metabolite identifiers. +% +% OPTIONAL INPUTS: +% outputDir: Path to directory that will contain the MOL files +% (default: current directory). +% updateDB: Logical value idicating if the database will be +% updated or not. If it's true, "outputDir" should +% contain an existing database (default: false). +% standardisationApproach: String contianing the type of standarization for +% the moldecules (default: empty) +% * explicitH - Normal chemical graphs. +% * implicitH - Hydrogen suppressed chemical +% graphs. +% * Neutral - Chemical graphs with protonated +% molecules. +% * basic - Adding the header. +% orderOfPreference: Vector indicating the source of preference +% (default: 1:7) +% 1.- VMH (http://vmh.life/) +% 2.- InChI (requires openBabel) +% 3.- Smiles (requires openBabel) +% 4.- KEGG (https://www.genome.jp/) +% 5.- HMDB (https://hmdb.ca/) +% 6.- PubChem (https://pubchem.ncbi.nlm.nih.gov/) +% 7.- CHEBI (https://www.ebi.ac.uk/) +% +% OUTPUTS: +% missingMolFiles: List of missing MOL files +% nonStandardised: List of non-standardised MDL MOL file. + +if nargin < 2 || isempty(outputDir) + outputDir = [pwd filesep]; +else + % Make sure input path ends with directory separator + outputDir = [regexprep(outputDir,'(/|\\)$',''), filesep]; +end +if nargin < 3 + updateDB = false; +end +if nargin < 4 + standardisationApproach = []; +end +if nargin < 5 + orderOfPreference = 1:7; +end + +[oBabelInstalled, ~] = system('obabel'); +webTimeout = weboptions('Timeout', 30); + +% Set directories +if exist([outputDir 'newMol'], 'dir') == 0 + mkdir([outputDir 'newMol']) +end +newMolFilesDir = [outputDir 'newMol' filesep]; +if updateDB + if exist([outputDir 'met' filesep standardisationApproach filesep], 'dir') ~= 0 + modelMets = regexprep(model.mets,'(\[\w\])',''); + fnames = dir([newMolFilesDir '*.mol']); + model = removeMetabolites(model, model.mets(~ismember(modelMets, setdiff(modelMets, split([fnames(:).name], '.mol'))))); + else + display('Directory with MOL files was not found to be updated in:') + display([outputDir 'met' filesep standardisationApproach filesep]) + display('A new database will be created') + end +end + +%% Obtain met data + +% Obtain ID's +fields = fieldnames(model); +% inchi +inchiFieldBool = ~cellfun(@isempty, regexpi(fields, 'inchi')); +if any(inchiFieldBool) + inchis = model.(fields{inchiFieldBool}); +end +% SMILES +smilesFieldBool = ~cellfun(@isempty, regexpi(fields, 'smiles')); +if any(smilesFieldBool) + smiles = model.(fields{smilesFieldBool}); +end +% HMDB +hmdbFieldBool = ~cellfun(@isempty, regexpi(fields, 'hmdb')); +if any(hmdbFieldBool) + hmdbIDs = model.(fields{hmdbFieldBool}); +end +% KEGG +keggFieldBool = ~cellfun(@isempty, regexpi(fields, 'kegg')); +if any(keggFieldBool) + if sum(keggFieldBool) > 1 + metFieldBool = ~cellfun(@isempty, regexpi(fields, 'met')); + keggFieldBool = keggFieldBool & metFieldBool; + end + keggIDs = model.(fields{keggFieldBool}); +end +% PubChem +PubChemFieldBool = ~cellfun(@isempty, regexpi(fields, 'PubChem')); +if any(PubChemFieldBool) + PubChemIDs = model.(fields{PubChemFieldBool}); +end +% chebi +chebiFieldBool = ~cellfun(@isempty, regexpi(fields, 'chebi')); +if any(chebiFieldBool) + chebiIDs = model.(fields{chebiFieldBool}); +end + +%% Obtain met structures + +% Unique metabolites idexes +[umets, ia] = unique(regexprep(model.mets, '(\[\w\])', '')); +% umets = model.mets; +% ia = 1:numel(model.mets); + +missingMetBool = true(length(umets), 1); +% Obtain MDL MOL files +idsToCheck = {}; +for i = 1:length(umets) + for j = 1:7 + switch orderOfPreference(j) + + case 1 + % VMH + % if prod(~isnan(VMH{metIdxs(i)})) && ~isempty(VMH{metIdxs(i)}) && exist('VMH', 'var') && missing + % + % end + + case 2 % inchi + if prod(~isnan(inchis{ia(i)})) && ~isempty(inchis{ia(i)}) && oBabelInstalled && missingMetBool(i) + try + fid2 = fopen([outputDir 'tmp'], 'w'); + fprintf(fid2, '%s\n', inchis{ia(i)}); + fclose(fid2); + command = ['obabel -iinchi ' outputDir 'tmp -O ' newMolFilesDir umets{i} '.mol mol']; + [status, cmdout] = system(command); + if contains(cmdout, '1 molecule converted') + missingMetBool(i) = false; + end + delete([outputDir 'tmp']) + catch ME + disp(ME.message) + idsToCheck(end + 1, 1) = inchis(ia(i)); + end + end + + case 3 % Smiles + if prod(~isnan(smiles{ia(i)})) && ~isempty(smiles{ia(i)}) && oBabelInstalled && missingMetBool(i) + try + fid2 = fopen([outputDir 'tmp'], 'w'); + fprintf(fid2, '%s\n', smiles{ia(i)}); + fclose(fid2); + command = ['obabel -ismi ' outputDir 'tmp -O ' newMolFilesDir umets{i} '.mol mol']; + [status,cmdout] = system(command); + if status == 0 + missingMetBool(i) = false; + end + delete([outputDir 'tmp']) + catch ME + disp(ME.message) + idsToCheck(end + 1, 1) = smiles(ia(i)); + end + end + + case 4 % KEGG + if prod(~isnan(keggIDs{ia(i)})) && ~isempty(keggIDs{ia(i)}) && missingMetBool(i) + try + switch keggIDs{ia(i)}(1) + case 'C' + molFile = webread(['https://www.genome.jp/dbget-bin/www_bget?-f+m+compound+' keggIDs{ia(i)}], webTimeout); + case 'D' + molFile = webread(['https://www.kegg.jp/dbget-bin/www_bget?-f+m+drug+' keggIDs{ia(i)}], webTimeout); + end + if ~isempty(regexp(molFile, 'M END')) + fid2 = fopen([newMolFilesDir umets{i} '.mol'], 'w'); + fprintf(fid2, '%s\n', molFile); + fclose(fid2); + missingMetBool(i) = false; + end + catch ME + disp(ME.message) + idsToCheck(end + 1, 1) = keggIDs(ia(i)); + end + end + + case 5 % HMDB + if prod(~isnan(hmdbIDs{ia(i)})) && ~isempty(hmdbIDs{ia(i)}) && missingMetBool(i) + try + numbersID = hmdbIDs{ia(i)}(5:end); + if size(numbersID, 2) < 7 + numbersID = [repelem('0', 7 - size(numbersID, 2)) numbersID]; + end + molFile = webread(['https://hmdb.ca/structures/metabolites/HMDB' numbersID '.mol'], webTimeout); + if ~isempty(regexp(molFile, 'M END')) + fid2 = fopen([newMolFilesDir umets{i} '.mol'], 'w'); + fprintf(fid2, '%s\n', molFile); + fclose(fid2); + missingMetBool(i) = false; + end + catch ME + disp(ME.message) + idsToCheck(end + 1, 1) = hmdbIDs(ia(i)); + end + end + + case 6 % PubChem + if prod(~isnan(PubChemIDs{ia(i)})) && ~isempty(PubChemIDs{ia(i)}) && missingMetBool(i) + try + molFile = webread(['https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/'... + num2str(PubChemIDs{ia(i)}) ... + '/record/SDF/?record_type=2d&response_type=display'], webTimeout); + % Delete all after 'M END' from the SDF filte to + % make it MOL file + if ~isempty(regexp(molFile, 'M END')) + molFile(regexp(molFile, 'M END') + 6:end) = []; + fid2 = fopen([newMolFilesDir umets{i} '.mol'], 'w'); + fprintf(fid2, '%s\n', molFile); + fclose(fid2); + missingMetBool(i) = false; + end + catch ME + disp(ME.message) + idsToCheck(end + 1, 1) = PubChemIDs(ia(i)); + end + end + + case 7 % ChEBI + if prod(~isnan(chebiIDs{ia(i)})) && ~isempty(chebiIDs{ia(i)}) && missingMetBool(i) + try + molFile = webread(['https://www.ebi.ac.uk/chebi/saveStructure.do?defaultImage=true&chebiId=' num2str(chebiIDs{ia(i)}) '&imageId=0'], webTimeout); + if ~isempty(regexp(molFile, 'M END')) + fid2 = fopen([newMolFilesDir umets{i} '.mol'], 'w'); + fprintf(fid2, '%s\n', molFile); + fclose(fid2); + missingMetBool(i) = false; + end + catch ME + disp(ME.message) + idsToCheck(end + 1, 1) = chebiIDs(ia(i)); + end + end + end + end +end + +%% Standardise Mol Files + +if ~isempty(standardisationApproach) + + % Set up directories + switch standardisationApproach + case 'explicitH' + standardisedDir = [outputDir 'explicitH' filesep]; + case 'implicitH' + standardisedDir = [outputDir 'implicitH' filesep]; + case 'protonated' + standardisedDir = [outputDir 'protonated' filesep]; + otherwise + standardisationApproach = 'basic'; + standardisedDir = molDir; + end + + % Standardise files + umets(missingMetBool) = []; + standardisationReport = standardiseMolDatabase(tmpDir, umets, standardisedDir, standardisationApproach); + + % Get SMILES and InChIs + if isfield(standardisationReport, 'SMILES') + SMILES = standardisationReport.SMILES; + else + SMILES = ''; + end + if isfield(standardisationReport, 'InChIs') + InChIs = standardisationReport.InChIs; + else + InChIs = ''; + end + % Delete empty cells + InChIs(cellfun(@isempty, InChIs)) = []; + SMILES(cellfun(@isempty, SMILES)) = []; + + if updateDB && ~isempty(InChIs) && ~isempty(SMILES) + + % For InChIs + if isfile([standardisedDir 'InChIs']) + % Merge old and new InChIs + InChIsFile = regexp( fileread([standardisedDir 'InChIs']), '\n', 'split')'; + InChIsFile(cellfun(@isempty, InChIsFile)) = []; + InChIsFileSp = split(InChIsFile, ' - '); + smilesSp = split(InChIs, ' - '); + mergedSmiles(:, 2) = unique([InChIsFileSp(:, 2); smilesSp(:, 2)]); + mergedSmiles(ismember(mergedSmiles(:, 2), smilesSp(:, 2)), 1) = smilesSp(:, 2); + mergedSmiles(ismember(mergedSmiles(:, 2), InChIsFileSp(:, 2)), 1) = InChIsFileSp(:, 2); + mergedSmiles = strcat(mergedSmiles(:, 1), {' - '}, mergedSmiles(:, 2)); + % Write InChIs + fid2 = fopen([standardisedDir 'InChIs'], 'w'); + fprintf(fid2, '%s\n', mergedSmiles{:}); + fclose(fid2); + else + % Write InChIs + fid2 = fopen([standardisedDir 'InChIs'], 'w'); + fprintf(fid2, '%s\n', InChIs{:}); + fclose(fid2); + end + + % For SMILES + if isfile([standardisedDir 'SMILES']) + % Merge old and new InChIs + smilesFile = regexp( fileread([standardisedDir 'SMILES']), '\n', 'split')'; + smilesFile(cellfun(@isempty, smilesFile)) = []; + smilesFileSp = split(smilesFile, ' - '); + smilesSp = split(SMILES, ' - '); + mergedSmiles(:, 2) = unique([smilesFileSp(:, 2); smilesSp(:, 2)]); + mergedSmiles(ismember(mergedSmiles(:, 2), smilesSp(:, 2)), 1) = smilesSp(:, 2); + mergedSmiles(ismember(mergedSmiles(:, 2), smilesFileSp(:, 2)), 1) = smilesFileSp(:, 2); + mergedSmiles = strcat(mergedSmiles(:, 1), {' - '}, mergedSmiles(:, 2)); + % Write InChIs + fid2 = fopen([standardisedDir 'SMILES'], 'w'); + fprintf(fid2, '%s\n', mergedSmiles{:}); + fclose(fid2); + else + % Write InChIs + fid2 = fopen([standardisedDir 'SMILES'], 'w'); + fprintf(fid2, '%s\n', SMILES{:}); + fclose(fid2); + end + + else + % Write InChIs + fid2 = fopen([standardisedDir 'InChIs'], 'w'); + fprintf(fid2, '%s\n', InChIs{:}); + fclose(fid2); + % Write SMILES + fid2 = fopen([standardisedDir 'SMILES'], 'w'); + fprintf(fid2, '%s\n', SMILES{:}); + fclose(fid2); + end +end + +%% Report + +% Make report +molCollectionReport.noOfMets = size(umets, 1); +molCollectionReport.noOfMetsWithMol = sum(~missingMetBool); +molCollectionReport.noOfMetsWithoutMol = sum(missingMetBool); +molCollectionReport.coverage = (molCollectionReport.noOfMetsWithMol * 100) / molCollectionReport.noOfMets; + +% Check standardised data +if ~isempty(standardisationApproach) + nRows = size(standardisationReport.SMILES, 1); + varTypes = {'string', 'string', 'string', 'string'}; + varNames = {'mets', 'InChIKeys', 'InChIs', 'SMILES'}; + molCollectionReport.standardisationReport = table('Size', [nRows length(varTypes)], 'VariableTypes', varTypes, 'VariableNames', varNames); + molCollectionReport.standardisationApproach = standardisationApproach; + molCollectionReport.standardisationReport(1:end) = standardisationReport.standardised; + molCollectionReport.standardisationReport.InChIKeys(1:size(standardisationReport.InChIKeys, 1)) = standardisationReport.InChIKeys; + molCollectionReport.standardisationReport.InChIs(1:size(standardisationReport.InChIs, 1)) = standardisationReport.InChIs; + molCollectionReport.standardisationReport.SMILES(1:size(standardisationReport.SMILES, 1)) = standardisationReport.SMILES; +end + +end \ No newline at end of file diff --git a/src/dataIntegration/chemoInformatics/standardiseMolDatabase.m b/src/dataIntegration/chemoInformatics/standardiseMolDatabase.m new file mode 100644 index 0000000000..e49af35f31 --- /dev/null +++ b/src/dataIntegration/chemoInformatics/standardiseMolDatabase.m @@ -0,0 +1,212 @@ +function standardisationReport = standardiseMolDatabase(molDir, metList, standardisedDir, standardisationApproach) +% Standardize an MDL MOL file directory by representing the reaction using +% normal chemical graphs, hydrogen suppressed chemical graphs, and chemical +% graphs with protonated molecules. The function also updates the header +% with the standardization information. It makes use of ChemAxon's +% standardizer and openBabel. +% +% USAGE: +% +% [standardised, nonStandardised, InChIs, SMILES] = standardiseMolDatabase(molDir, standardisedDir, standardiseMolFiles) +% +% INPUTS: +% molDir: Path to directory that contain the MDL MOL files +% to be standardised. +% +% OPTIONAL INPUTS: +% metList: 1 x n lest of metabolites to standardise +% standardisedDir Path to directory that will contain the +% standardised MDL MOL files (default: current +% directory). +% standardisationApproach: String contianing the type of standarization for +% the moldecules (default: 'explicitH' if +% openBabel is installed, otherwise 'basic') +% * explicitH - Normal chemical graphs. +% * implicitH - Hydrogen suppressed chemical +% graphs. +% * neutral - Chemical graphs with protonated +% molecules. +% * basic - Updating the header. +% +% OUTPUTS: +% A standardised molecular structures database +% standardisationReport: Struct array with the standarization report +% * standardised +% * nonStandardised +% * InChIs +% * SMILES +% * InChIKeys +% +% .. Author: - German Preciat 25/06/2020 + + +molDir = [regexprep(molDir,'(/|\\)$',''), filesep]; +if nargin < 2 || isempty(metList) + metList = []; +end +if nargin < 3 || isempty(standardisedDir) + standardisedDir = [pwd filesep]; +else + % Make sure input path ends with directory separator + standardisedDir = [regexprep(standardisedDir,'(/|\\)$',''), filesep]; +end +% Check if openBabel is installed +if nargin < 4 || isempty(standardisationApproach) + standardisationApproach = 'explicitH'; +end + +% Check if chemAxon and OpenBabel are installed +[oBabelInstalled, ~] = system('obabel'); +[marvinInstalled, ~] = system('cxcalc'); +if marvinInstalled ~= 0 + marvinInstalled = false; +end +if oBabelInstalled ~= 1 + oBabelInstalled = false; + standardisationApproach = 'basic'; +end + +% Assign directories and create them +standardisedMolFiles = [standardisedDir 'molFiles' filesep]; +if marvinInstalled + standardisedImages = [standardisedDir 'images' filesep]; +end +if ~exist(standardisedMolFiles, 'dir') + mkdir(standardisedMolFiles) + if marvinInstalled + mkdir(standardisedImages) + end +end + +% The new MOL files are readed +% Get list of MOL files +d = dir(molDir); +d = d(~[d.isdir]); +aMets = {d.name}'; +aMets = aMets(~cellfun('isempty', regexp(aMets,'(\.mol)$'))); +if ~isempty(metList) + aMets = aMets(ismember(regexprep(aMets, '.mol', ''), metList)); +end + +fprintf('\n Standardizing %d MOL files ... \n', size(aMets, 1)); + +y = 0; +n = 0; +nonStandardised = []; +InChIs = []; +InChIKeys = []; +for i = 1:size(aMets, 1) +% try + name = aMets{i}; + + if oBabelInstalled + + % Obtain SMILES + command = ['obabel -imol ' molDir name ' -osmiles']; + [~, cmdout] = system(command); + cmdout = splitlines(cmdout); + cmdout = split(cmdout{end - 2}); + smiles = cmdout{1}; + SMILES{i, 1} = smiles; + + % Obtain inChIKey and InChI + % inChIKey + command = ['obabel -imol ' molDir name ' -oinchikey']; + [~, cmdout] = system(command); + cmdout = split(cmdout); + inchikeyIdx = find(cellfun(@numel, cmdout) == 27); + if ~isempty(inchikeyIdx) + InChIKey = ['InChIKey=' cmdout{inchikeyIdx}]; + InChIKeys{i, 1} = InChIKey; + else + InChIKey = ''; + InChIKeys{i, 1} = {''}; + end + % InChI + command = ['obabel -imol ' molDir name ' -oinchi']; + [~, cmdout] = system(command); + cmdout = split(cmdout); + if any(contains(cmdout,'InChI=1S')) + InChIs{i, 1} = cmdout{contains(cmdout,'InChI=1S')}; + fid2 = fopen('tmp', 'w'); + fprintf(fid2, '%s\n', cmdout{contains(cmdout,'InChI=1S')}); + fclose(fid2); + % Create an InChI based-MOL file + command = ['obabel -iinchi tmp -O ' standardisedMolFiles name ' --gen2D']; + [~, ~] = system(command); + delete('tmp') + else + InChIs{i, 1} = {''}; + fid2 = fopen('tmp', 'w'); + fprintf(fid2, '%s\n', smiles); + fclose(fid2); + command = ['obabel -ismiles tmp -O ' standardisedMolFiles name ' --gen2D']; + [~, ~] = system(command); + delete('tmp') + end + + % Adapt database + switch standardisationApproach + case 'explicitH' + command = ['obabel -imol ' standardisedMolFiles name ' -O ' standardisedMolFiles name ' -h']; + [~, ~] = system(command); + case 'implicitH' + % Delete explicit hydrogens + command = ['obabel -imol ' standardisedMolFiles name ' -O ' standardisedMolFiles name ' -d']; + [~, ~] = system(command); + case 'neutral' + % Neutralize molecule + command = ['obabel -imol ' standardisedMolFiles name ' -O ' standardisedMolFiles name ' –neutralize']; + [~, ~] = system(command); + end + else + + InChIKey = ''; + copyfile([molDir name], standardisedMolFiles) + + end + + % Rewrite headings + molFile = regexp(fileread([standardisedMolFiles name]), '\n', 'split')'; + molFile{1} = name(1:end-4); + molFile{2} = ['COBRA Toolbox v3.0 - ' standardisationApproach ' molecule - ' datestr(datetime)]; + molFile{3} = InChIKey; + fid2 = fopen([standardisedMolFiles name], 'w'); + fprintf(fid2, '%s\n', molFile{:}); + fclose(fid2); + % Generate images + if marvinInstalled + fdata = dir([standardisedMolFiles name]); + command = ['molconvert jpeg:w' num2str(fdata.bytes / 10) ',h' num2str(fdata.bytes / 10) ' ' standardisedMolFiles name ' -o ' standardisedImages name(1:end-4) '.jpeg']; + [~, ~] = system(command); + end + + % Save data and delete the non-standardised molecule + y = y + 1; + standardised{y, 1} = name(1:end - 4); + % delete([molDir name]) + +% catch +% +% % Save data in case it is not standardised +% n = n + 1; +% nonStandardised{n, 1} = name(1:end - 4); +% +% end +end + +% Prepare report +standardisationReport.standardisationApproach = standardisationApproach; +standardisationReport.standardised = standardised; +standardisationReport.nonStandardised = nonStandardised; +if exist('InChIs', 'var') + standardisationReport.InChIs = InChIs; +end +if exist('SMILES', 'var') + standardisationReport.SMILES = SMILES; +end +if exist('InChIKeys', 'var') + standardisationReport.InChIKeys = InChIKeys; +end + +end \ No newline at end of file diff --git a/src/dataIntegration/chemoInformatics/transportRxnAM.m b/src/dataIntegration/chemoInformatics/transportRxnAM.m new file mode 100644 index 0000000000..1cc36c0bbe --- /dev/null +++ b/src/dataIntegration/chemoInformatics/transportRxnAM.m @@ -0,0 +1,121 @@ +function mappedRxns = transportRxnAM(rxnDir, outputDir) +% This function atom maps the transport reactions for a given director in +% MDL RXN file format. +% +% USAGE: +% +% mappedRxns = transportRxnAM(rxnDir, outputDir) +% +% INPUTS: +% rxnDir: Path to directory that contains the RXN files +% (default: current directory). +% +% OPTIONAL INPUTS: +% outputDir: Path to directory that will contain the atom +% mapped transport reactions (default: current +% directory). +% +% OUTPUTS: +% mappedRxns: List of missing MOL files atom mapped transport +% reactions. + +rxnDir = [regexprep(rxnDir,'(/|\\)$',''), filesep]; % Make sure input path ends with directory separator +if nargin < 2 || isempty(outputDir) + outputDir = [pwd filesep]; +else + % Make sure input path ends with directory separator + outputDir = [regexprep(outputDir,'(/|\\)$',''), filesep]; +end + +% Create directory if it is missing +if exist(outputDir) ~= 7 + mkdir('transportRxnsAM') +end + +% Check if the directory is not empty +fnames = dir([rxnDir '*.rxn']); +assert(~isempty(fnames), '''rxnDir'' does not contain RXN files'); + +c = 0; +for i = 1:length(fnames) + + % Read the MOL file + rxnFile = regexp( fileread([rxnDir fnames(i).name]), '\n', 'split')'; + rxnFormula = rxnFile{4}; + assert(~isempty(rxnFormula), 'There is not a chemical formula.'); + % Check if it is a transport reaction + rxnFormula = split(rxnFormula, {' -> ', ' <=> '}); + substrates = split(rxnFormula{1}, ' + '); + substrates = expandMets(substrates); + products = split(rxnFormula{2}, ' + '); + products = expandMets(products); + if isequal(substrates, products) + + % Identify the corresponding metabolites in the substrates and + % products + begMol = strmatch('$MOL', rxnFile); + for j = 1:length(begMol) + if j <= numel(substrates) + metSubs{j} = regexprep((rxnFile{begMol(j) + 1}), '(\[\w\])', ''); + else + metProds{j - numel(substrates)} = regexprep((rxnFile{begMol(j) + 1}), '(\[\w\])', ''); + end + end + + % Atom map + atom = 0; + for j = 1:numel(metSubs) + nuOfAtoms = str2double(rxnFile{begMol(j) + 4}(1:3)); + productIdx = strmatch(metSubs{j}, metProds); + for k = 1:nuOfAtoms + atom = atom + 1; + switch length(num2str(atom)) + case 1 + data2print = [' ' num2str(atom) ' 0 0']; + case 2 + data2print = [' ' num2str(atom) ' 0 0']; + case 3 + data2print = [num2str(atom) ' 0 0']; + end + rxnFile{begMol(j) + 4 + k} = [rxnFile{begMol(j) + 4 + k}(1:60) data2print]; + rxnFile{begMol(productIdx(1) + numel(metSubs)) + 4 + k} = [rxnFile{begMol(productIdx(1) + numel(metSubs)) + 4 + k}(1:60) data2print]; + end + metProds(productIdx(1)) = {'done'}; + end + + % Write the file + fid2 = fopen([outputDir fnames(i).name], 'w'); + fprintf(fid2, '%s\n', rxnFile{:}); + fclose(fid2); + + c = c + 1; + mappedRxns{c} = regexprep(fnames(i).name, '.rxn', ''); + clear metSubs metProds + + end +end + +if ~exist('mappedRxns', 'var') + mappedRxns = []; +end +end + +function newMetList = expandMets(metList) + +% Check if a metabolite has an number to be expanded +idxsCheck = ~cellfun(@isempty, regexp(metList, ' ')); +if any(idxsCheck) + idx = find(idxsCheck); + % Add repeated metabolites + for i = 1:length(idx) + met2expand = split(metList(idx(i))); + metList = [metList; repelem(met2expand(2), str2double(met2expand(1)))']; + end + metList(idx) = []; +end + +% Create the new list with metabolites sorted and without a compartment +newMetList = metList; +newMetList = sort(regexprep(newMetList, '(\[\w\])', '')); + +end diff --git a/src/reconstruction/comparison/compareModels.m b/src/reconstruction/comparison/compareModels.m new file mode 100644 index 0000000000..d49363e615 --- /dev/null +++ b/src/reconstruction/comparison/compareModels.m @@ -0,0 +1,46 @@ +function [isSameModel,differences] = compareModels(modelA,modelB,printLevel) +%compares modelA with modelB, looking for differences between the +%structures +% +% INPUT +% modelA: structure +% modelB: structure +% printLevel: +% +% OUTPUT +% isSameModel: true if identical models, false otherwise +% differences: structure listing differences between models +% *.reason: gives a text stack of why the difference occurred +% as well as a field +% *.where: contains the indices and subfields of the structure +% where the comparison failed. +% +% +% Note: This function depends on structeq.m and celleq.m + +if~exist('printLevel','var') + printLevel=1; +end + +differences=[]; +result = 0; +i=0; +while result==0 + [result, why] = structeq(modelA,modelB); + if result==0 + i=i+1; + if printLevel>0 + fprintf('%s\n',why.Reason) + fprintf('%s\n',why.Where) + end + differences(i).reason = why.Reason; + differences(i).where = why.Where; + %fieldName = strrep(why.Where,'(1)',''); + fieldName = strtok(why.Where,'(1)'); + eval(['modelB' fieldName ' = modelA' fieldName ';']); + end +end +isSameModel = i==0; + +end + diff --git a/test/verifiedTests/dataIntegration/testChemoInformatics/metaboliteIds.xlsx b/test/verifiedTests/dataIntegration/testChemoInformatics/metaboliteIds.xlsx new file mode 100644 index 0000000000..1b46cd9755 Binary files /dev/null and b/test/verifiedTests/dataIntegration/testChemoInformatics/metaboliteIds.xlsx differ diff --git a/test/verifiedTests/dataIntegration/testChemoInformatics/refData_generateChemicalDatabase.mat b/test/verifiedTests/dataIntegration/testChemoInformatics/refData_generateChemicalDatabase.mat new file mode 100644 index 0000000000..a3e0ee2354 Binary files /dev/null and b/test/verifiedTests/dataIntegration/testChemoInformatics/refData_generateChemicalDatabase.mat differ diff --git a/test/verifiedTests/dataIntegration/testChemoInformatics/testGenerateChemicalDatabase.m b/test/verifiedTests/dataIntegration/testChemoInformatics/testGenerateChemicalDatabase.m new file mode 100644 index 0000000000..5a62115bca --- /dev/null +++ b/test/verifiedTests/dataIntegration/testChemoInformatics/testGenerateChemicalDatabase.m @@ -0,0 +1,64 @@ +% The COBRAToolbox: testGenerateChemicalDatabase.m +% +% Purpose: +% - tests the generateChemicalDatabase function using the Citric Acid +% Cycle E. coli Core Model +% + +fprintf(' Testing generateChemicalDatabase ... \n' ); + +% Save the current path +currentDir = pwd; +mkdir([currentDir filesep 'tmpDB']) + +% Check external software +[cxcalcInstalled, ~] = system('cxcalc'); +cxcalcInstalled = ~cxcalcInstalled; +[oBabelInstalled, ~] = system('obabel'); +[javaInstalled, ~] = system('java'); + +% Load the E. coli Core Model TCA rxns +load ecoli_core_model.mat +model.mets = regexprep(model.mets, '\-', '\_'); +rxnsToExtract = {'AKGDH', 'CS', 'FUM', 'ICDHyr', 'MDH', 'SUCOAS'}; +model = extractSubNetwork(model, rxnsToExtract); + +% initialize the test +fileDir = fileparts(which('testGenerateChemicalDatabase.m')); + +% Load reference data +load('refData_generateChemicalDatabase.mat') + +%% Add external information in the model + +inputData = [fileDir filesep 'metaboliteIds.xlsx']; +replace = false; +[model, hasEffect] = addMetInfoInCBmodel(model, inputData, replace); + +assert(isequal(model, model1), 'The identifiers are different') +assert(hasEffect, 'The function addMetInfoInCBmodel din not have effect') + +%% Set optional variables according the software installed + +options.outputDir = [currentDir filesep 'tmpDB']; +options.printlevel = 0; + +if cxcalcInstalled && oBabelInstalled && javaInstalled + options.adjustToModelpH = true; + options.onlyUnmapped = false; +else + options.adjustToModelpH = false; + options.onlyUnmapped = true; +end + +[info, model] = generateChemicalDatabase(model, options); + +if cxcalcInstalled && oBabelInstalled && javaInstalled + assert(isequal(model, model2), 'The model ouput is different') + assert(isequal(info, info1), 'The database report is different') +else + assert(isequal(model, model3), 'The model ouput is different') + assert(isequal(info, info2), 'The database report is different') +end + +rmdir([currentDir filesep 'tmpDB'], 's') \ No newline at end of file