-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest.py
89 lines (64 loc) · 2.97 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from dfin.utils.azure_openai import get_completion_4
def extract_query_skeleton(question):
prompt = f"""
Your task is to identify and extract potential column names from a natural language SQL query. Focus on retaining words that could correspond to column names in a database, while discarding common words, aggregate functions, and other irrelevant terms.
For example, in the query "Find the total revenue for each product category", the words "find", "the", "total", "for", "each" are irrelevant or any other word which is a sql function or an operator. The potential column names or key terms are:
revenue, product, category
Now, identify and list the potential column names or key terms from the following query, pick at most 5 terms:
Query: "{question}"
"""
# Make an API call to OpenAI to get the completion
response_text = get_completion_4(prompt)
return response_text.strip().split('\n')
# Example usage
question = "Show me the average number of people in each city where the population is greater than 100,000."
question = "What is the highest eligible free rate for K-12 students in the schools in Alameda County?"
extracted_terms = extract_query_skeleton(question)
print(extracted_terms)
from nltk.tokenize import word_tokenize
import inflect
p = inflect.engine()
import re
def split_camel_snake_case(text):
# Split snake_case
text = re.sub(r'_', ' ', text)
# Split CamelCase
text = re.sub(r'([a-z0-9])([A-Z])', r'\1 \2', text)
return text
def convert_to_singular(word):
singular = p.singular_noun(word)
return singular if singular else word
def clean_up_terms(terms):
cleaned_terms = set()
for term in terms:
# Split camel case and snake case
term = split_camel_snake_case(term)
# Tokenize the term
tokens = word_tokenize(term)
# Filter out any unwanted characters or words, convert to lowercase, and convert to singular form
cleaned_tokens = [convert_to_singular(token.lower()) for token in tokens if token.isalpha()]
cleaned_terms.update(cleaned_tokens)
return list(cleaned_terms)
# # Example usage
# cleaned_terms = clean_up_terms(extracted_terms)
# print(cleaned_terms)
print(clean_up_terms(["DoctorsName"]))
def keyword_match(query_tokens, preprocessed_columns):
matches = {}
for token in query_tokens:
for column_name, column_tokens in preprocessed_columns.items():
if token in column_tokens:
matches[token] = column_name
break # Stop searching once a match is found for this token
return matches
# Example usage
query = "Which doctor is the best?"
extracted_terms = extract_query_skeleton(query)
cleaned_terms = clean_up_terms(extracted_terms)
# Assuming preprocessed_columns is a dictionary of preprocessed column names from your database
preprocessed_columns = {
"doctorsNames": ['doctor', 'name'],
"politics": ['politics']
}
matches = keyword_match(cleaned_terms, preprocessed_columns)
print(matches)