From 8332082652566c55ef2b7e0a304c2eb2925a8a33 Mon Sep 17 00:00:00 2001 From: Thomas Faria <57811152+ThomasFaria@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:07:44 +0100 Subject: [PATCH] fix data loading --- src/train.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/train.py b/src/train.py index dacb181..57ff660 100644 --- a/src/train.py +++ b/src/train.py @@ -6,7 +6,6 @@ import fasttext import mlflow import pandas as pd -import pyarrow.parquet as pq from sklearn.model_selection import train_test_split from preprocessor import Preprocessor from constants import TEXT_FEATURE, Y, DATA_PATH, LABEL_PREFIX @@ -18,11 +17,7 @@ def load_data(): """ Load data for training and test. """ - fs = s3fs.S3FileSystem( - client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"}, - anon=True - ) - df = pq.ParquetDataset(DATA_PATH, filesystem=fs).read_pandas().to_pandas() + df = pd.read_parquet(f"https://minio.lab.sspcloud.fr/{DATA_PATH}") return df.sample(frac=0.1)