From 5509e5c90f50c52bc42122cc5bcee66f4b7dade7 Mon Sep 17 00:00:00 2001
From: Ali Al Jasim <103039350+im4li@users.noreply.github.com>
Date: Wed, 1 May 2024 14:09:41 +0000
Subject: [PATCH 1/2] using Logistic Regression and similar to the example provided

---
 .../.dockerignore                             |  1 +
 authorship-verification-submission/Dockerfile |  5 ++
 .../authorship_verification_submission.py     | 67 +++++++++++++++++++
 3 files changed, 73 insertions(+)
 create mode 100644 authorship-verification-submission/.dockerignore
 create mode 100644 authorship-verification-submission/Dockerfile
 create mode 100644 authorship-verification-submission/authorship_verification_submission.py

diff --git a/authorship-verification-submission/.dockerignore b/authorship-verification-submission/.dockerignore
new file mode 100644
index 0000000..8c785a3
--- /dev/null
+++ b/authorship-verification-submission/.dockerignore
@@ -0,0 +1 @@
+predictions.jsonl
\ No newline at end of file
diff --git a/authorship-verification-submission/Dockerfile b/authorship-verification-submission/Dockerfile
new file mode 100644
index 0000000..1130b34
--- /dev/null
+++ b/authorship-verification-submission/Dockerfile
@@ -0,0 +1,5 @@
+FROM fschlatt/natural-language-processing-exercises:0.0.1
+
+ADD authorship_verification_submission.py /code/authorship_verification_submission.py
+
+ENTRYPOINT [ "python3", "/code/authorship_verification_submission.py" ]
\ No newline at end of file
diff --git a/authorship-verification-submission/authorship_verification_submission.py b/authorship-verification-submission/authorship_verification_submission.py
new file mode 100644
index 0000000..2c51851
--- /dev/null
+++ b/authorship-verification-submission/authorship_verification_submission.py
@@ -0,0 +1,67 @@
+"""Authorship verification submission: TF-IDF features + logistic regression.
+
+Fits the classifier on the training split and writes one prediction per
+validation document to ``predictions.jsonl`` in the TIRA output directory.
+"""
+
+from pathlib import Path
+
+from tira.rest_api_client import Client
+from tira.third_party_integrations import get_output_directory
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score
+from sklearn.pipeline import Pipeline
+
+TASK = "nlpbuw-fsu-sose-24"
+TRAIN_DATASET = "authorship-verification-train-20240408-training"
+VALIDATION_DATASET = "authorship-verification-validation-20240408-training"
+
+
+def evaluate_model(model, data, labels):
+    """Return the accuracy of ``model.predict(data)`` against ``labels``."""
+    return accuracy_score(labels, model.predict(data))
+
+
+def main():
+    """Train the pipeline, report validation accuracy, and save predictions."""
+    tira = Client()
+
+    # loading train data
+    text_train = tira.pd.inputs(TASK, TRAIN_DATASET)
+    targets_train = tira.pd.truths(TASK, TRAIN_DATASET)
+    # loading validation data (automatically replaced by test data when run on tira)
+    text_validation = tira.pd.inputs(TASK, VALIDATION_DATASET)
+    # NOTE(review): confirm truths are still available once the inputs are
+    # replaced by test data on TIRA; if not, the accuracy report must be skipped.
+    targets_validation = tira.pd.truths(TASK, VALIDATION_DATASET)
+
+    # Model training: TF-IDF over the 1000 most frequent terms feeding a
+    # logistic regression classifier.
+    model = Pipeline(
+        [
+            ("vectorizer", TfidfVectorizer(max_features=1000)),
+            ("classifier", LogisticRegression()),
+        ]
+    )
+    model.fit(text_train["text"], targets_train["generated"])
+
+    val_accuracy = evaluate_model(
+        model, text_validation["text"], targets_validation["generated"]
+    )
+    print("Validation Accuracy:", val_accuracy)
+
+    # make predictions
+    text_validation["generated"] = model.predict(text_validation["text"])
+    df = text_validation[["id", "generated"]]
+
+    # Save the predictions
+    output_directory = get_output_directory(str(Path(__file__).parent))
+    df.to_json(
+        Path(output_directory) / "predictions.jsonl", orient="records", lines=True
+    )
+
+
+if __name__ == "__main__":
+    main()

From 063a124fa94c38cefe0016e038864902009d111c Mon Sep 17 00:00:00 2001
From: Ali Al Jasim <103039350+im4li@users.noreply.github.com>
Date: Wed, 1 May 2024 14:22:59 +0000
Subject: [PATCH 2/2] problem with output, trying to remove docker ignore

---
 authorship-verification-submission/.dockerignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/authorship-verification-submission/.dockerignore b/authorship-verification-submission/.dockerignore
index 8c785a3..e69de29 100644
--- a/authorship-verification-submission/.dockerignore
+++ b/authorship-verification-submission/.dockerignore
@@ -1 +0,0 @@
-predictions.jsonl
\ No newline at end of file