nlq_to_sql_demo.py

# -*- coding: utf-8 -*-
"""NLQ_to_SQL_Demo.py

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1OPYhIeMfVKotFOkf1ZlWwFMIEStGTn7v
"""

# Import Libraries*

import torch.nn.functional as F
import torch
from transformers import AutoTokenizer, AutoModel
import sqlite3
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics.pairwise import cosine_similarity

# Load Model

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(
    'sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Load the model and tokenizer
tokenizer_decoder = AutoTokenizer.from_pretrained('tscholak/2jrayxos')
model_decoder = AutoModelForSeq2SeqLM.from_pretrained("tscholak/2jrayxos")

# Function for Meanpooling


# Mean Pooling - Take attention mask into account for correct averaging

def mean_pooling(model_output, attention_mask):
    # First element of model_output contains all token embeddings
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(
        -1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Function to get Semantic Similarity


def encoder_decoder_1(query_sentence, table_names, tokenizer, model, cursor):
    # Tokenize query sentence, table names
    query_sentence_encoded = tokenizer(
        [query_sentence], padding=True, truncation=True, return_tensors='pt')
    table_names_encoded = tokenizer(
        table_names, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings for query sentence, table names
    with torch.no_grad():
        query_sentence_output = model(**query_sentence_encoded)
        table_names_output = model(**table_names_encoded)

    # Perform pooling for query sentence, table names
    query_sentence_embedding = mean_pooling(
        query_sentence_output, query_sentence_encoded['attention_mask'])
    table_names_embeddings = mean_pooling(
        table_names_output, table_names_encoded['attention_mask'])

    # Normalize embeddings for query sentence, table names
    query_sentence_embedding = F.normalize(
        query_sentence_embedding, p=2, dim=1)
    table_names_embeddings = F.normalize(table_names_embeddings, p=2, dim=1)

    # Find the most similar table names by computing the cosine similarity between
    cosine_similarities_tables = torch.nn.functional.cosine_similarity(
        query_sentence_embedding, table_names_embeddings, dim=1)
    most_similar_table_names_indices = cosine_similarities_tables.argsort(
        descending=True)
    most_similar_table_names = [table_names[i]
                                for i in most_similar_table_names_indices]

    # Find the index of the highest matching table name by finding the maximum value
    max_similarity_table_index = cosine_similarities_tables.argmax()

    # Get the highest matching table name by using the index obtained above
    highest_matching_table_name = table_names[max_similarity_table_index]

    # Find the column names of the highest matching table by querying the database
    cursor.execute(f"PRAGMA table_info({highest_matching_table_name});")
    highest_matching_table_column_names = [
        column_info[1] for column_info in cursor.fetchall()]

    highest_matching_table_column_names = ", ".join(
        highest_matching_table_column_names)

    highest_matching_table_column_names = list(
        highest_matching_table_column_names.split(", "))

    highest_matching_table_column_names = [column_name.replace(
        ' ', '_') for column_name in highest_matching_table_column_names]

    return highest_matching_table_name, highest_matching_table_column_names


# Function to Generate SQL Query


def encoder_decoder_2(query_sentence, database_name, highest_matching_table_name, highest_matching_table_column_names,
                      tokenizer_decoder, model_decoder):
    # Make input text in this format. input_text = "list name of film released in 2018 and rating more than 6? |
    # Movie: rating, year, title"
    input_text_1 = query_sentence + " | " + database_name + " | " + \
                   highest_matching_table_name + ": " + \
                   str(highest_matching_table_column_names)

    input_ids_1 = tokenizer_decoder.encode(input_text_1, return_tensors='pt')

    # Generate the output
    output_1 = model_decoder.generate(
        input_ids_1, max_length=128, num_beams=4, early_stopping=True)

    # Decode the output
    output_text_1 = tokenizer_decoder.decode(
        output_1[0], skip_special_tokens=True)

    # Output: IMDB | select title from movie where rating > 6 and year =2018

    # split the output into two parts (sql and table name)
    output_text_1 = output_text_1.split("|")
    sql_query = output_text_1[1].strip()

    # return the sql query
    return sql_query


# Funtion to Execute SQL Query


def sql_executor(sql_query, highest_matching_table_column_names, cursor):
    # convert list to lower case
    highest_matching_table_column_names = [
        x.lower() for x in highest_matching_table_column_names]

    # if s3 contains any of the words in lst1 then replace it with double quotes
    for i in highest_matching_table_column_names:
        if i in sql_query:
            sql_query = sql_query.replace(i, '"' + i + '"')

    # replace underscore with space
    sql_query = sql_query.replace("_", " ")
    # replace all single quotes with double quotes
    sql_query = sql_query.replace("'", '"')
    # Print the sql query
    print(sql_query)
    print(" ")

    # Execute the sql
    cursor.execute(sql_query)
    result = cursor.fetchall()

    # print the result
    print(result)


# Main Function


def main():
    # Query sentence
    query_sentence = input("Enter question: ")

    # Connect to database and fetch table names and column names
    conn = sqlite3.connect('/content/actor_database.db')
    cursor = conn.cursor()

    # Get the filename that is connected above
    filename = conn.cursor().execute("PRAGMA database_list;").fetchall()[0][2]

    # filename = '/content/Db-IMDB.db'
    # split the filename to get the database name
    database_name = filename.split('/')[-1].split('.')[0]

    table_names = [table_info[0] for table_info in cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table';").fetchall()]
    column_names = []
    for table_name in table_names:
        cursor.execute(f"PRAGMA table_info({table_name});")
        column_names.extend([column_info[1]
                             for column_info in cursor.fetchall()])

    highest_matching_table_name, highest_matching_table_column_names = encoder_decoder_1(
        query_sentence, table_names, tokenizer, model, cursor)

    sql_query = encoder_decoder_2(query_sentence, database_name, highest_matching_table_name,
                                  highest_matching_table_column_names, tokenizer_decoder, model_decoder)

    sql_executor(sql_query, highest_matching_table_column_names, cursor)

    # Close database connection
    conn.close()


# Test with Natural Language Query

if __name__ == "__main__":
    main()