-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnlp_using_smiles_&_properties.py
73 lines (58 loc) · 2.06 KB
/
nlp_using_smiles_&_properties.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# -*- coding: utf-8 -*-
"""NLP using SMILES & Properties.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1DcQstFDu7avXT6MPt-ihmADKuMCqhLrJ
"""
#importing necessary libraries
import pubchempy as pcp
import nltk
from nltk.tokenize import word_tokenize
#function to extract compounds from PubChem
def extract_compounds(smiles, properties):
    """Query PubChem for every token found in ``smiles`` and ``properties``.

    ``smiles`` is split on whitespace only, so it may hold one SMILES string
    or several separated by spaces. It is deliberately not passed through a
    natural-language word tokenizer: such tokenizers treat brackets and
    punctuation as token boundaries, which would break a SMILES string into
    fragments that no longer describe the molecule.
    ``properties`` is tokenized with NLTK's ``word_tokenize``.

    Args:
        smiles: Whitespace-separated SMILES strings. ``None`` or an empty
            string contributes no lookups.
        properties: Free-text property expression. ``None`` or an empty
            string contributes no lookups.

    Returns:
        A list with one entry per successful lookup, SMILES lookups first.
        Each entry is whatever ``pcp.get_compounds`` returned for that token.
        Lookups that raise are logged as warnings and skipped.
    """
    import logging

    logger = logging.getLogger(__name__)

    smiles_tokens = smiles.split() if smiles else []
    properties_tokens = word_tokenize(properties) if properties else []

    # NOTE(review): 'property' does not appear among the namespaces that
    # pubchempy documents for get_compounds, so these lookups are expected to
    # fail and be skipped. Kept for backward compatibility - confirm what
    # query was actually intended here.
    lookups = [(token, 'smiles') for token in smiles_tokens]
    lookups += [(token, 'property') for token in properties_tokens]

    compounds = []
    for token, namespace in lookups:
        try:
            compounds.append(pcp.get_compounds(token, namespace))
        except Exception as exc:
            # Best-effort by design: one bad token must not abort the rest,
            # but the failure is reported instead of vanishing silently.
            logger.warning(
                "PubChem lookup failed for %r (namespace %r): %s",
                token, namespace, exc,
            )
    return compounds
# Example usage: run extract_compounds on a sample SMILES string and a sample
# properties string, then print whatever it returns.
smiles, properties = 'C1CN2CC3=CC=CC=C3N=C2[C@H]1O', 'molecular_weight>180'
compounds = extract_compounds(smiles, properties)
print(compounds)
import requests
def get_compound_name(smiles):
    """Return the IUPAC name that PubChem reports for a SMILES string.

    The SMILES is sent as the ``smiles`` query argument instead of being
    pasted into the URL path. SMILES routinely contain characters such as
    ``/``, ``#``, ``+`` and ``%`` that carry their own meaning inside a URL,
    so interpolating them into the path corrupts the request.

    Args:
        smiles: SMILES string identifying the compound.

    Returns:
        The ``IUPACName`` of the first property record in the response, or
        the string ``"Error: Unable to fetch data from PubChem"`` when
        ``smiles`` is empty, the HTTP request cannot be completed, or PubChem
        answers with a status other than 200.
    """
    error_message = "Error: Unable to fetch data from PubChem"

    # An empty identifier can never resolve; skip the pointless round trip.
    if not smiles:
        return error_message

    # Define the PubChem API URL (the identifier travels as a query argument)
    url = (
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/property/"
        "IUPACName,MolecularWeight,CanonicalSMILES,InChIKey/JSON"
    )

    # Send the request; the timeout keeps a stalled connection from hanging
    # the caller forever, and transport failures use the same error
    # convention as a bad HTTP status.
    try:
        response = requests.get(url, params={"smiles": smiles}, timeout=30)
    except requests.RequestException:
        return error_message

    # Check if the request was successful
    if response.status_code != 200:
        return error_message

    # Extract the name of the compound from the response
    data = response.json()
    return data['PropertyTable']['Properties'][0]['IUPACName']
# Example usage: ask get_compound_name for the IUPAC name of a sample SMILES
# string and print the result (or the error string it returns on failure).
smiles = "CC(=O)Oc1ccccc1C(=O)O"
compound_name = get_compound_name(smiles)
print("Compound name:", compound_name)
# Notebook-only setup step. `!pip install ...` is IPython shell syntax and a
# SyntaxError in a plain .py file, so it is kept below as a comment. Install
# the package from a shell (`pip install pubchempy`) before running this
# script, or restore the line when pasting this back into a notebook cell.
# !pip install pubchempy
import nltk
# Fetch the Punkt tokenizer data used by nltk's word_tokenize; this has to
# have been downloaded once before extract_compounds tokenizes anything.
nltk.download('punkt')