Skip to content

Commit

Permalink
Using Logistic Regression, similar to the example provided
Browse files Browse the repository at this point in the history
  • Loading branch information
im4li committed May 1, 2024
1 parent 5df65e9 commit 5509e5c
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 0 deletions.
1 change: 1 addition & 0 deletions authorship-verification-submission/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
predictions.jsonl
5 changes: 5 additions & 0 deletions authorship-verification-submission/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Base image with the course's NLP exercise dependencies preinstalled.
FROM fschlatt/natural-language-processing-exercises:0.0.1

# Copy the submission script into the image.
ADD authorship_verification_submission.py /code/authorship_verification_submission.py

# Run the submission script when the container starts.
ENTRYPOINT [ "python3", "/code/authorship_verification_submission.py" ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from pathlib import Path

from tira.rest_api_client import Client
from tira.third_party_integrations import get_output_directory

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline




def evaluate_model(model, data, labels):
    """Return the accuracy of *model*'s predictions on *data* against *labels*."""
    return accuracy_score(labels, model.predict(data))



if __name__ == "__main__":
    tira = Client()

    # Fetch the training split and its ground-truth labels.
    train_texts = tira.pd.inputs(
        "nlpbuw-fsu-sose-24", "authorship-verification-train-20240408-training"
    )
    train_truths = tira.pd.truths(
        "nlpbuw-fsu-sose-24", "authorship-verification-train-20240408-training"
    )
    # Fetch the validation split (replaced by the test split when run on TIRA).
    val_texts = tira.pd.inputs(
        "nlpbuw-fsu-sose-24", "authorship-verification-validation-20240408-training"
    )
    val_truths = tira.pd.truths(
        "nlpbuw-fsu-sose-24", "authorship-verification-validation-20240408-training"
    )

    # TF-IDF features (top 1000 terms) feeding a logistic-regression classifier.
    model = Pipeline([
        ('vectorizer', TfidfVectorizer(max_features=1000)),
        ('classifier', LogisticRegression()),
    ])
    model.fit(train_texts['text'], train_truths['generated'])

    # Accuracy on the held-out split; kept for local debugging.
    val_accuracy = evaluate_model(model, val_texts['text'], val_truths['generated'])
    # print("Validation Accuracy:", val_accuracy)

    # Attach predictions and keep only the columns the evaluator expects.
    val_texts["generated"] = model.predict(val_texts["text"])
    df = val_texts[["id", "generated"]]

    # Write the predictions as JSON Lines into the TIRA output directory.
    output_directory = get_output_directory(str(Path(__file__).parent))
    df.to_json(
        Path(output_directory) / "predictions.jsonl", orient="records", lines=True
    )

0 comments on commit 5509e5c

Please sign in to comment.