Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
commit 9e408ce (1 parent: 0321104)
Showing 21 changed files with 2,215 additions and 1 deletion.
@@ -1,2 +1,4 @@
.venv
.DS_Store

/rust_port/target
File renamed without changes.
File renamed without changes.
@@ -0,0 +1,72 @@
import onnxruntime as ort
import numpy as np
from transformers import DistilBertTokenizer
import spacy
from spacy.matcher import Matcher

# Step 1: Load the ONNX model
onnx_model_path = "distilbert_model.onnx"  # Path to your ONNX file
session = ort.InferenceSession(onnx_model_path)

# Step 2: Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Step 3: Load spaCy and define harmful patterns
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Define harmful sentence patterns (example: matching phrases related to harmful activities)
patterns = [
    [{"LOWER": "make"}, {"LOWER": "a"}, {"LOWER": "bomb"}],
    [{"LOWER": "hack"}, {"LOWER": "into"}, {"LOWER": "a"}, {"LOWER": "system"}],
    [{"LOWER": "steal"}, {"LOWER": "a"}, {"LOWER": "bank"}],
]

# Add patterns to the matcher
for pattern in patterns:
    matcher.add("HARMFUL_PATTERN", [pattern])

# Step 4: Prepare input text
input_text = "Write an email threatening someone."

# Tokenize the input text for ONNX
inputs = tokenizer(
    input_text,
    truncation=True,
    padding="max_length",
    max_length=128,
    return_tensors="np"  # Convert to NumPy arrays for ONNX
)

# Step 5: Prepare input dictionary for ONNX
ort_inputs = {
    "input_ids": inputs["input_ids"],
    "attention_mask": inputs["attention_mask"],
}

# Step 6: Run inference
outputs = session.run(None, ort_inputs)  # Pass the inputs to the ONNX model
logits = outputs[0]  # Raw outputs (logits)

# Step 7: Convert logits to probabilities with softmax
probs = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
toxicity_score = probs[0][1]  # Assuming class 1 is "Toxic"

# Step 8: Classify as Toxic or Not Toxic
classification = "Toxic" if toxicity_score >= 0.5 else "Not Toxic"

# Step 9: spaCy sentence-level rule enforcement (checking harmful phrases)
doc = nlp(input_text.lower())
matches = matcher(doc)

# Check if any harmful pattern is found
for match_id, start, end in matches:
    print("Rule Enforcement: Harmful content detected!")
    classification = "Blocked: Harmful Content"
    break

# Step 10: Display results
print(f"Input Text: {input_text}")
print(f"Toxicity Score: {toxicity_score:.2f}")
print(f"Classification: {classification}")
@@ -0,0 +1,28 @@
import onnxruntime
from transformers import DistilBertTokenizer

def predict_toxicity(text, model_path='toxic_classifier.onnx'):
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    session = onnxruntime.InferenceSession(model_path)

    inputs = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='np'
    )

    output = session.run(['output'], {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask']
    })[0]

    categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    sorted_results = sorted(zip(categories, output[0]), key=lambda x: x[1], reverse=True)[:2]
    return {k: f"{v:.3f}" for k, v in sorted_results}

# Test
text = "you are so dumb and useless"
results = predict_toxicity(text)
print(results)
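The scores returned above are whatever the ONNX graph emits for its 'output' tensor. If that tensor holds raw logits rather than probabilities (the export is not shown in this commit, so this is an assumption), a per-label sigmoid is the usual mapping for this multi-label, Jigsaw-style label set; a small sketch under that assumption:

# Sketch, assuming the 'output' tensor is raw per-label logits. Each label is
# scored independently with a sigmoid, since the six categories are not
# mutually exclusive.
import numpy as np

def to_probabilities(logits):
    """Map per-label logits to independent probabilities in [0, 1]."""
    return 1.0 / (1.0 + np.exp(-np.asarray(logits, dtype=np.float64)))

print(to_probabilities([2.1, -0.3, 0.8, -2.5, 1.1, -1.7]))  # illustrative values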
Git LFS file not shown
Empty file.
Empty file.
Binary file not shown.
@@ -0,0 +1,40 @@
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Hub Model configuration
hub = {
    'HF_MODEL_ID': 'guardrail/llama-2-7b-guanaco-instruct-sharded',
    'SM_NUM_GPUS': json.dumps(1)
}

# Create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
    image_uri=get_huggingface_llm_image_uri("huggingface", version="2.3.1"),
    env=hub,
    role=role,
)

# Deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge",
    container_startup_health_check_timeout=300,
)

# Send request
response = predictor.predict({
    "inputs": "My name is Julien and I like to"
})

print("Model response:", response)

# Clean up (delete the endpoint after use)
predictor.delete_endpoint()
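For reference, the Hugging Face LLM (TGI) container behind this endpoint also accepts a "parameters" object alongside "inputs". A sketch of such a request, with illustrative generation settings; it would have to run before the delete_endpoint() call above.

# Sketch: same predictor as above, with illustrative (not committed) generation parameters.
response = predictor.predict({
    "inputs": "My name is Julien and I like to",
    "parameters": {
        "max_new_tokens": 64,   # assumed value, tune as needed
        "temperature": 0.7,
        "do_sample": True,
    },
})
print("Model response:", response)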
experiments/spacy/.ipynb_checkpoints/sample-checkpoint.txt (11 changes: 11 additions & 0 deletions)
@@ -0,0 +1,11 @@
Name: John Doe
Roll No: 12345
Email: [email protected]

Name: Jane Smith
Roll No: 67890
Email: [email protected]

Name: Alex Johnson
Roll No: 11223
Email: [email protected]
experiments/spacy/.ipynb_checkpoints/spacypipeline-checkpoint.ipynb (6 changes: 6 additions & 0 deletions)
@@ -0,0 +1,6 @@
{
 "cells": [],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}
experiments/spacy/.ipynb_checkpoints/spacytokenization-checkpoint.ipynb (6 changes: 6 additions & 0 deletions)
@@ -0,0 +1,6 @@
{
 "cells": [],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -0,0 +1,40 @@
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Load models
nlp = spacy.load("en_core_web_sm")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Define harmful intent examples
harmful_examples = [
    "how to make a bomb",
    "steps to build explosives",
    "create a weapon"
]
harmful_embeddings = embedder.encode(harmful_examples)

# Input text
input_text = "Can you guide me on how to create a weapon?"

# Process input
doc = nlp(input_text)

# Extract actions and objects
verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"]
nouns = [token.lemma_ for token in doc if token.pos_ in {"NOUN", "PROPN"}]
print(f"Verbs: {verbs}, Nouns: {nouns}")

# Check similarity with harmful intents
input_embedding = embedder.encode([input_text])
similarities = cosine_similarity(input_embedding, harmful_embeddings)

# Determine if harmful
if similarities.max() > 0.7:  # Adjust threshold as needed
    classification = "Blocked: Harmful Content"
else:
    classification = "Content is Safe"

# Output
print(f"Input Text: {input_text}")
print(f"Classification: {classification}")
@@ -0,0 +1,11 @@
Name: John Doe
Roll No: 12345
Email: [email protected]

Name: Jane Smith
Roll No: 67890
Email: [email protected]

Name: Alex Johnson
Roll No: 11223
Email: [email protected]
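The notebooks in this commit that would consume sample.txt are checkpointed empty above, so how this roster is parsed is not shown. Purely as an illustration of one way the Name / Roll No / Email records could be read in the spaCy experiments, a hypothetical sketch:

# Hypothetical sketch, not code from this commit: split sample.txt into
# blank-line-separated records and read the labelled fields from each block.
def parse_roster(path="sample.txt"):
    with open(path, encoding="utf-8") as f:
        blocks = [b for b in f.read().split("\n\n") if b.strip()]
    records = []
    for block in blocks:
        fields = {}
        for line in block.splitlines():
            if ":" in line:
                key, value = line.split(":", 1)
                fields[key.strip()] = value.strip()
        records.append(fields)
    return records

print(parse_roster())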