Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
ryukaizen committed Aug 9, 2024
0 parents commit 1490b84
Show file tree
Hide file tree
Showing 49 changed files with 23,009 additions and 0 deletions.
32 changes: 32 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# dependencies
/node_modules
/.pnp
.pnp.js

# python bytecode caches
__pycache__

# testing
/coverage

# production
/build

# misc
.DS_Store
.env.local
.env.development.local
.env.test.local
.env.production.local

npm-debug.log*
yarn-debug.log*
yarn-error.log*

test

# rasa cache and python virtual environments
.rasa
.venv
venv

# archives and compiled python files
# (a leading "*" is required: a bare ".tar.gz" or ".pyc" only matches a file
# with exactly that name, not files ending in that extension)
*.tar.gz
*.pyc
Empty file added marai/actions/__init__.py
Empty file.
75 changes: 75 additions & 0 deletions marai/actions/actions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import logging
from typing import Any, Text, Dict, List

from rasa_sdk import Action, Tracker
from rasa_sdk.executor import CollectingDispatcher
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from .tfidf.retriever import get_response

class MBartParaphraser:
    """Paraphrase text with the ``ai4bharat/MultiIndicParaphraseGeneration`` model.

    The tokenizer and the seq2seq model are loaded once in ``__init__``;
    ``rephrase`` then runs a short beam search for each message.
    """

    def __init__(self):
        checkpoint = "ai4bharat/MultiIndicParaphraseGeneration"
        self.tokenizer = AutoTokenizer.from_pretrained(
            checkpoint,
            do_lower_case=False,
            use_fast=False,
            keep_accents=True,
        )
        self.model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

        # Resolve the special-token ids once; they are passed to generate().
        token_id = self.tokenizer._convert_token_to_id_with_added_voc
        self.bos_id = token_id("<s>")
        self.eos_id = token_id("</s>")
        self.pad_id = token_id("<pad>")

        # Language tag appended to the input and used as the decoder start
        # token (presumably the Marathi tag -- confirm against the model card).
        self.lang_id = "<2mr>"

    def rephrase(self, message):
        """Return a paraphrase of ``message`` as a decoded string.

        The input is formatted as ``"<message> </s> <lang tag>"`` and encoded
        without automatic special tokens, since they are already spelled out.
        """
        prompt = message + " </s> " + self.lang_id
        encoded = self.tokenizer(
            prompt,
            add_special_tokens=False,
            return_tensors="pt",
            padding=True,
        )

        generation_options = {
            "use_cache": True,
            "no_repeat_ngram_size": 2,
            "encoder_no_repeat_ngram_size": 2,
            "num_beams": 2,
            "max_length": 30,
            "min_length": 10,
            "early_stopping": True,
            "pad_token_id": self.pad_id,
            "bos_token_id": self.bos_id,
            "eos_token_id": self.eos_id,
            # Decoding starts from the language tag token.
            "decoder_start_token_id": self.tokenizer._convert_token_to_id_with_added_voc(self.lang_id),
        }
        candidates = self.model.generate(encoded.input_ids, **generation_options)

        # Only the first returned sequence is used.
        return self.tokenizer.decode(
            candidates[0],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

# Shared module-level instance: the tokenizer and model are loaded once, when
# this module is imported, and reused by ActionRephraseResponse on every call.
paraphraser = MBartParaphraser()

class ActionRephraseResponse(Action):
    """Reply with a paraphrased version of the canned response for the latest intent.

    The response is looked up in the domain under ``utter_<intent name>``.
    When no such response exists, or its first variation carries no text,
    a fixed apology message is sent instead.
    """

    def name(self) -> Text:
        """Return the action name this class is registered under."""
        return "action_rephrase_response"

    def run(
        self,
        dispatcher: CollectingDispatcher,
        tracker: Tracker,
        domain: Dict[Text, Any],
    ) -> List[Dict[Text, Any]]:
        """Send one message (paraphrase or apology) and return no events."""
        intent_info = tracker.latest_message.get('intent') or {}
        latest_intent = intent_info.get('name')
        utterance_name = f"utter_{latest_intent}"

        # A domain without a 'responses' section is treated like a missing
        # utterance rather than raising KeyError.
        variations = (domain.get('responses') or {}).get(utterance_name)

        # Only the first variation is used. It may have no 'text' key (for
        # example an image-only or custom response), so read it defensively.
        original_text = variations[0].get('text') if variations else None

        if original_text:
            rephrased_text = paraphraser.rephrase(original_text)
            dispatcher.utter_message(text=rephrased_text)
        else:
            dispatcher.utter_message(text="माफ करा, मी समजू शकलो नाही.")

        return []

class ActionDefaultFallback(Action):
    """Fallback action that answers with the result of ``get_response`` for the user's text."""

    def name(self) -> str:
        """Return the action name this class is registered under."""
        return "action_default_fallback"

    def run(self, dispatcher: CollectingDispatcher,
            tracker: Tracker,
            domain: dict) -> list:
        """Log the predicted intent, send the retrieved answer, return no events."""
        latest_message = tracker.latest_message

        # Diagnostics only: read the intent defensively so that a message
        # without intent data cannot make the fallback itself fail.
        intent_info = latest_message.get('intent') or {}
        logging.getLogger(__name__).info(
            "Intent: %s, Confidence: %s",
            intent_info.get('name'),
            intent_info.get('confidence'),
        )

        # 'text' may be absent or None; always hand a string to get_response.
        user_message = latest_message.get('text') or ''
        response = get_response(user_message)
        dispatcher.utter_message(text=response)
        return []
65 changes: 65 additions & 0 deletions marai/actions/inference_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import sys
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Standalone script for testing and experimenting with generation parameters; not part of production.

class MBartParaphraser:
    """Paraphrase text with the ``ai4bharat/MultiIndicParaphraseGeneration`` model.

    Experimental copy used to try out generation parameters. ``rephrase``
    passes the language tag as ``decoder_start_token_id`` so that decoding
    starts from that tag.
    """

    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained(
            "ai4bharat/MultiIndicParaphraseGeneration",
            do_lower_case=False,
            use_fast=False,
            keep_accents=True,
        )
        self.model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/MultiIndicParaphraseGeneration")
        self.bos_id = self.tokenizer._convert_token_to_id_with_added_voc("<s>")
        self.eos_id = self.tokenizer._convert_token_to_id_with_added_voc("</s>")
        self.pad_id = self.tokenizer._convert_token_to_id_with_added_voc("<pad>")
        # Language tag appended to the input and used as the decoder start
        # token (presumably the Marathi tag -- confirm against the model card).
        self.lang_id = "<2mr>"

    def rephrase(self, message):
        """Return a paraphrase of ``message`` as a decoded string.

        The input is formatted as ``"<message> </s> <lang tag>"`` and encoded
        without automatic special tokens, since they are already spelled out.
        """
        inp = self.tokenizer(
            message + " </s> " + self.lang_id,
            add_special_tokens=False,
            return_tensors="pt",
            padding=True,
        ).input_ids
        model_output = self.model.generate(
            inp,
            use_cache=True,  # reuse cached key/value states for faster decoding
            no_repeat_ngram_size=2,  # no 2-gram may occur twice in the output
            encoder_no_repeat_ngram_size=2,  # no 2-gram from the input may be copied into the output
            num_beams=2,  # beam width; larger values search more candidates but are slower
            max_length=30,  # maximum length (in tokens) of the generated sequence
            min_length=10,  # minimum length (in tokens) of the generated sequence
            early_stopping=True,  # stop beam search once enough finished candidates exist
            pad_token_id=self.pad_id,  # ID of the padding token
            bos_token_id=self.bos_id,  # ID of the beginning-of-sequence token
            eos_token_id=self.eos_id,  # ID of the end-of-sequence token
            # Start decoding from the language tag; without this the model's
            # configured default start token would be used instead.
            decoder_start_token_id=self.tokenizer._convert_token_to_id_with_added_voc(self.lang_id),
        )
        # Only the first returned sequence is used.
        return self.tokenizer.decode(
            model_output[0],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

def main():
    """Run an interactive prompt that paraphrases each line the user enters.

    The loop ends when the user types ``exit`` (case-insensitive, surrounding
    whitespace ignored) or closes/interrupts the input with Ctrl-D / Ctrl-C.
    Blank lines are skipped instead of being sent to the model.
    """
    paraphraser = MBartParaphraser()

    print("Welcome to the Marathi Paraphraser!")
    print("Enter Marathi text to paraphrase. Type 'exit' to quit.")

    while True:
        try:
            user_input = input("\nEnter Marathi text: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt at the prompt is a normal way to
            # quit; leave the loop without a traceback.
            print("\nGoodbye!")
            break

        if user_input.lower() == 'exit':
            print("Goodbye!")
            break

        if not user_input:
            continue

        try:
            paraphrased_text = paraphraser.rephrase(user_input)
            print(f"\nOriginal text: {user_input}")
            print(f"Paraphrased text: {paraphrased_text}")
        except Exception as e:
            # Top-level boundary of an experiment script: report the failure
            # and keep the session alive for the next input.
            print(f"An error occurred: {str(e)}")


# Start the interactive loop only when executed as a script, not on import.
if __name__ == "__main__":
    main()


# **** On current parameters ****
# Welcome to the Marathi Paraphraser!
# Enter Marathi text to paraphrase. Type 'exit' to quit.

# Enter Marathi text: माफ करा मी समजू शकलो नाही

# Original text: माफ करा मी समजू शकलो नाही
# Paraphrased text: मला समजत नाही मला माफ कर..

# Enter Marathi text:
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
नरेंद्र दामोदरदास मोदी (जन्म : १७ सप्टेंबर १९५०) हे भारतीय जनता पक्षाचे (भाजपचे) नेते आणि मे २६, इ।स।२०१४ पासून स्वतंत्र भारताचे १५ वे पंतप्रधान आहेत।ते ऑक्टोबर ७, इ।स।२००१ पासून मे २२, इ।स।२०१४ पर्यंत गुजरात राज्याचे मुख्यमंत्री होते।स्वातंत्र्यप्राप्तीनंतर जन्मलेले ते भारताचे पहिले पंतप्रधान आहेत।भाजपच्या गुजरात विधानसभेच्या २००२ ते २०१२ च्या तसेच १९९५ च्या व १९९८ च्या निवडणूक विजयांमध्ये मोदींचे मोठे योगदान होते।ते २००१ च्या ऑक्टोबर मध्ये गुजरातचे मुख्यमंत्री झाले व त्यानंतर सरळ ४ विधानसभा जिंकत मोदींनी मुख्यमंत्रिपदाचा कार्यभार पाहिला।२००९ लोकसभा निवडणुकीसाठीही ते भाजपचे कूटनीतिज्ञ होते। मोदी हे हिंदुत्ववादी विचारसरणीचे नेते आहेत व संघाचे स्वयंसेवक आहेत।मोदी हे गुजरात राज्याच्या विकासासाठी ओळखले जातात।त्यांच्या अर्थकारणाची प्रशंसा सर्वत्र केली जाते।परंतु २००२ च्या दंगलीत त्यांच्या भूमिकेबद्दल आक्षेप घेतले गेले
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
पंचमहाभूते सांख्यदर्शनानुसार सर्व भौतिक जग (दृश्य विश्व), सजीव व निर्जीव पदार्थ, आपले शरीर हे पाच मूळ तत्त्वांपासून (द्रव्यांपासून) तयार झाले आहे असे मानले जाते.
या पाच तत्त्वांना पंचमहाभूते असे म्हणतात.
हिंदू धर्मातील तत्त्वज्ञानावर प्राचीन सांख्यदर्शन या शास्त्राचा मोठा प्रभाव आहे.
ही मूळतत्त्वे खालील प्रमाणे आहेत.
यातील प्रथम तत्त्व अग्नि येते
माती, दगड व त्यापासून निर्माण झालेले सर्व जड, कठीण, घन पदार्थ.
पाणी, वाफ, ढग व सर्व द्रव, ओले, मृदू पदार्थ.
ऊर्जा: अग्नी (क्षेपणास्त्र), किरणे, प्रारणे (Radiation), उष्णता, वीज, प्रकाश या स्वरूपात.
हवा, चैतन्य, हालचाल, चलनवलन, तरलता आणि वेग.
अवकाश, आकाश व पोकळी.

या पंचमहाभूतांची देवस्थाने पुढीलप्रमाणे-
१.पृथ्वी- कांचिवरम
२.आप- जम्बुकेश्वर
३.तेज- अरुणाचल
४.वायू- कालहस्ती
५.आकाश- चिदंबरम
Loading

0 comments on commit 1490b84

Please sign in to comment.