diff --git a/src/parse/reactomeData.py b/src/parse/reactomeData.py index 219b25e..0a7d053 100755 --- a/src/parse/reactomeData.py +++ b/src/parse/reactomeData.py @@ -389,11 +389,16 @@ def getCommonNameFromUniprot(self): files = os.listdir("../misc/data/Uniprot/") path = "../misc/data/Uniprot/" i = 0 + haveMapping = False + haveGeneSymbol = False #print('Parsing UniProt files ...') for f in files: i = i + 1 #if i % 1000 == 0: #print('Processing {} files'.format(i)) + + haveMapping = False + haveGeneSymbol = False try: tree = ET.parse(path + f) geneid = f.replace(".xml","") @@ -410,31 +415,43 @@ def getCommonNameFromUniprot(self): #print(geneid+":"+name.text) try: mapping = self.geneInfoDictionary['uniprot:'+geneid] + haveMapping = True mapping["common_name"] = "gene_symbol:"+name.text + haveGeneSymbol = True except KeyError: pass + print("Key Error for "+geneid+" in file "+f) + # print("Raw data does not have this ID ...") # print(geneid) # we now have uniprot to 'common_name', really gene id. # now we want to grab the NCBI/Entrez 'GeneID' -# if childtag == "dbReference": -# if child2.get("type") == "GeneID": -# geneId = child2.get("id") -# geneId = 'entrez:'+geneId + if childtag == "dbReference": + if child2.get("type") == "GeneID": + + if not haveMapping: + print("Hey we are adding a gene id but don't have new mapping. Uniprot:"+f) + # we don't have the mapping for the protein based from above... + # jump to next child... eventually next file. + continue + + geneId = child2.get("id") + geneId = 'entrez:'+geneId # protein to gene can be 1:n, so they have to be stored as a list # lets check for a value -# idList = mapping.get("small_e_entrez", None) -# if(idList == None): -# idList = list() -# mapping["small_e_entrez"] = idList + idList = mapping.get("small_e_entrez", None) + if(idList == None): + idList = list() + mapping["small_e_entrez"] = idList -# idList.append(geneId) + idList.append(geneId) except ET.ParseError: print("Skip {} ...".format(f)) + pass # def checkFiles(self): diff --git a/src/update/RaMPDatabase.py b/src/update/RaMPDatabase.py deleted file mode 100644 index 2e43159..0000000 --- a/src/update/RaMPDatabase.py +++ /dev/null @@ -1,68 +0,0 @@ -import os -import pymysql.cursors -from fileinput import close -import pandas as pd - - -class RaMPDatabase(): - ''' - This class is the super class of all checker, updater class - It contains general functions to aid genral functionality of other classes - - attribute str dbname database name for the mysql - - ''' - def __init__(self): - - self.table_names = [ - "analyte", - "analytehasontology", - "analytehaspathway", - "analytesynonym", - "catalyzed", - "ontology", - "pathway", - "source"] - - - - def check_path(self,dir): - ''' - This fucntion check if this directory exists, otherwise it will create one - - param dict dir: The directory to check or created. - - return: True if the path has been created successfully - ''' - if not os.path.exists(dir): - try: - os.makedirs(dir) # check if the directory exists, create one if not - return True - except OSError as e: # Trap the OS error and show the embedded error code - if e.errno != errno.EEXIST: - raise - - - def connectToRaMP(self, host= "localhost", user = "root" - ,password = "Ehe131224",dbname = "mathelabramp"): - ''' - Connect to local RaMP database by MySQL - - param str host host name for the mysql connection - - param str user username for the mysql conncection - - param str dbname database name for connection if None: connect to the database page - instead of table page - - param str password the password you used for you computer's mysql database - ''' - if dbname is not None: - conn = pymysql.connect(host = host, - user= user, - password = password, - db = dbname, - charset = "utf8mb4", - cursorclass = pymysql.cursors.DictCursor) - else: - conn = pymysql.connect(host = host, - user= user, - password = password, - charset = "utf8mb4", - cursorclass = pymysql.cursors.DictCursor) - return conn - diff --git a/src/update/RaMPFixer.py b/src/update/RaMPFixer.py deleted file mode 100644 index 2747155..0000000 --- a/src/update/RaMPFixer.py +++ /dev/null @@ -1,27 +0,0 @@ -from update.RaMPDatabase import RaMPDatabase -import pymysql.err -from builtins import str -import pandas as pd -import numpy as np -from schema import session,Source,Analyte,Pathway,Analytehasontology,\ -Analytehaspathway,Analytesynonym,Ontology,Catalyzed - -class RaMPFixer(RaMPDatabase): - ''' - This class simulates the function of the C# code in RaMP - Get the original RaMP data from MySQL, then do the following things: - 1) For each table, find the missing data cell, and remove the entire row since the map - is not successful. (Some columns are deliberated to left empty with 'NA'.) - 2) Remove special character such as ' ', and wrong data in the cell. Consider drop entire - row based on conditions - 3) Remapping corrected RaMP ID relations based on corrected data. - 4) Create and import new RaMP data to the database. - ''' - - def __init__(self): - super().__init__() - - self.new_tables = dict() - - - diff --git a/src/update/__pycache__/RaMPDatabase.cpython-35.pyc b/src/update/__pycache__/RaMPDatabase.cpython-35.pyc deleted file mode 100644 index 24e39f9..0000000 Binary files a/src/update/__pycache__/RaMPDatabase.cpython-35.pyc and /dev/null differ diff --git a/src/update/__pycache__/RaMPFixer.cpython-35.pyc b/src/update/__pycache__/RaMPFixer.cpython-35.pyc deleted file mode 100644 index 7724f65..0000000 Binary files a/src/update/__pycache__/RaMPFixer.cpython-35.pyc and /dev/null differ diff --git a/src/util/SQLiteDBBulkLoader.py b/src/util/SQLiteDBBulkLoader.py index 84c8ce9..ae3333e 100644 --- a/src/util/SQLiteDBBulkLoader.py +++ b/src/util/SQLiteDBBulkLoader.py @@ -156,7 +156,6 @@ def loadFile(self, resource, engine): df = df.drop_duplicates(ignore_index=False, inplace=False, keep='first') print(str(df.shape)) - print(df.head(n=5)) table = resource.destTable # this loads the data frame into the table. try: @@ -834,7 +833,7 @@ def updateSourcePathwayCount(self): # conn.execute(sql) # conn.close() - sql = "select ap.rampId, count(distinct(ap.pathwayRampId)) as pathwayCount from analytehaspathway ap "\ + sql = "select count(distinct(ap.pathwayRampId)) as pathwayCount, ap.rampId from analytehaspathway ap "\ "where ap.pathwaySource != 'hmdb' group by ap.rampId" sql2 = "update source set pathwayCount = :pathwayCount where rampId = :rampId" @@ -842,11 +841,20 @@ def updateSourcePathwayCount(self): with self.engine.connect() as conn: df = conn.execute(sql).all() df = pd.DataFrame(df) - df.columns = ["rampId", "pathwayCount"] - + df.columns = ["pathwayCount", "rampId"] + + print("setting pw count... shape=") + print(df.shape) + print(df.head(10)) + + k = 0 for i,row in df.iterrows(): + k = k + 1 + if k < 10: + print(row) + print("\n") conn.execute(sql2, row) - + conn.close() print("Finished: updating pathway counts in source table")