Changes from all commits

31 commits
6d2f92d  Apply stopwords backend logic (Oct 17, 2024)
1cfc6c1  merge conflicts (Oct 18, 2024)
b3bb80e  Backend logic for stopwords (Oct 23, 2024)
b748f0d  Backend logic for stopwords (Oct 25, 2024)
f1c950d  Added nltk library (Nov 4, 2024)
adef532  Pass cleaned volumes (Nov 7, 2024)
c22b8a0  Pass cleaned volumes (Nov 7, 2024)
d0b3453  Pass cleaned volumes (Nov 7, 2024)
4c6fd1a  Download stopwords (Nov 10, 2024)
bfd6ff7  Download stopwords (Nov 10, 2024)
dce5056  Download stopwords (Nov 12, 2024)
b2d0a1a  Download stopwords (Nov 13, 2024)
36d7962  Download stopwords (Nov 13, 2024)
a1a12d5  Download stopwords (Nov 13, 2024)
c035138  Backend changes (Dec 10, 2024)
2e02ff1  Backend changes (Dec 11, 2024)
1a0ce8a  Backend changes (Dec 11, 2024)
6aa3a6f  Backend changes (Dec 11, 2024)
5f81b58  Backend changes (Dec 11, 2024)
5606a5f  Backend changes (Dec 11, 2024)
650206f  Backend changes (Dec 11, 2024)
8dc1919  Backend changes (Dec 11, 2024)
8b51cf3  Backend changes (Dec 19, 2024)
143c565  Merge branch 'develop' into applystopwordds (dkudeki, Jan 7, 2025)
a776b83  Added the stopword files (how did I get these if they weren't part of… (dkudeki, Jan 13, 2025)
3b528f0  Merge branch 'develop' into applystopwordds (dkudeki, Mar 3, 2025)
3d0f44e  Fixing lint errors, updating version in pyproject (dkudeki, Mar 3, 2025)
7069434  Cleanup changes, stop overwriting collisions when applying stopwords (dkudeki, Mar 4, 2025)
4e369c6  Merge branch 'develop' into applystopwordds (dkudeki, May 20, 2025)
dbd9c8c  Update poetry.lock (dkudeki, May 20, 2025)
d85f19b  AggFeatures type needs to come first, check if language is set in cle… (dkudeki, May 22, 2025)
64 changes: 64 additions & 0 deletions htrc/torchlite/data.py
@@ -1,8 +1,18 @@
from uuid import UUID

import nltk
from .converters import torchlite_volume_meta_from_ef
from .ef.models import Volume
from .models.dashboard import FilterSettings
from .models.dashboard import DataCleaningSettings
from .utils import make_set
from .config import config
from nltk.corpus import stopwords
import os
import json
import logging

log = logging.getLogger(config.PROJECT_NAME)

def apply_filters(volumes: list[Volume], filters: FilterSettings) -> list[Volume]:
    filtered_volumes = []
@@ -34,3 +44,57 @@ def apply_filters(volumes: list[Volume], filters: FilterSettings) -> list[Volume]:
            filtered_volumes.append(volume)

    return filtered_volumes

def load_stopwords(dashboard_id: UUID, language: str, directory="stopword_lists"):
    # Make sure the NLTK corpus is available, then materialize the bundled
    # default lists on disk as JSON.
    nltk.download('stopwords')
    default_languages = ['english', 'german', 'spanish', 'french']
    if not os.path.exists(directory):
        os.makedirs(directory)
    for lang in default_languages:
        stopword_list = stopwords.words(lang)
        stopword_file_path = os.path.join(directory, f"{lang}_stopwords.json")
        with open(stopword_file_path, 'w', encoding='utf-8') as file:
            json.dump(stopword_list, file, ensure_ascii=False, indent=4)

    # A dashboard-specific upload, when present, takes precedence over the
    # bundled defaults.
    stopword_file_path = os.path.join(directory, f"{dashboard_id}_stopwords.json")
    log.debug(stopword_file_path)
    log.debug(os.path.exists(stopword_file_path))
    if os.path.exists(stopword_file_path):
        with open(stopword_file_path, 'r', encoding='utf-8') as file:
            return json.load(file)

    default_stopword_file = os.path.join(directory, f"{language}_stopwords.json")
    if os.path.exists(default_stopword_file):
        with open(default_stopword_file, 'r', encoding='utf-8') as file:
            return json.load(file)

    # Fall back to an empty list so callers can safely test membership.
    return []

def clean_volume_data(volume: Volume, stopwords: list[str]):
    cleaned_data = {}

    for word, count in volume.features.body.items():
        lower_word = word.lower()

        # Drop stopwords; merge counts for words that collide after
        # lowercasing instead of overwriting them.
        if lower_word not in stopwords:
            if lower_word not in cleaned_data:
                cleaned_data[lower_word] = count
            else:
                cleaned_data[lower_word] += count
    volume.features.body = cleaned_data
    return volume

def apply_datacleaning(dashboard_id: UUID, filtered_volumes: list[Volume], cleaning_settings: DataCleaningSettings):
    # Skip cleaning entirely when no settings or no language are set.
    if cleaning_settings and cleaning_settings.language:
        stopwords = load_stopwords(dashboard_id, cleaning_settings.language.lower())

        cleaned_volumes = []
        for volume in filtered_volumes:
            log.debug(f"'me' present before cleaning: {'me' in volume.features.body}")
            cleaned_volume = clean_volume_data(volume, stopwords)
            log.debug(f"'me' present after cleaning: {'me' in cleaned_volume.features.body}")
            cleaned_volumes.append(cleaned_volume)

        return cleaned_volumes
    return filtered_volumes
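
Reviewer note: a minimal sketch of what the new cleaning pass does to a token-count mapping. The SimpleNamespace stand-in is hypothetical (a real Volume needs full EF metadata to instantiate); clean_volume_data only reads and rewrites features.body.

from types import SimpleNamespace
from htrc.torchlite.data import clean_volume_data

# Hypothetical stand-in for an EF Volume.
volume = SimpleNamespace(features=SimpleNamespace(body={"The": 3, "the": 2, "Whale": 1, "whale": 4}))
cleaned = clean_volume_data(volume, stopwords=["the"])
print(cleaned.features.body)  # {'whale': 5}: stopwords dropped, case collisions merged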
2 changes: 1 addition & 1 deletion htrc/torchlite/ef/models.py
@@ -137,4 +137,4 @@ class VolumeAggFeaturesNoPos(BaseModel):
class Volume(BaseModel, Generic[FeaturesT]):
    htid: str
    metadata: VolumeMetadata
-    features: FeaturesT | None = None
+    features: VolumeAggFeaturesNoPos | VolumeFeatures | None = None
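
Reviewer note on commit d85f19b ("AggFeatures type needs to come first"): in a pydantic union, a model whose fields are a subset of another's can validate the same payload, so member order decides which type you get back. A toy sketch, not the real VolumeFeatures/VolumeAggFeaturesNoPos definitions:

from pydantic import BaseModel

class NoPos(BaseModel):  # toy stand-in for VolumeAggFeaturesNoPos
    body: dict[str, int]

class WithPages(BaseModel):  # toy stand-in for VolumeFeatures
    body: dict[str, int]
    pages: list[dict] = []

class Holder(BaseModel):
    features: NoPos | WithPages | None = None

h = Holder(features={"body": {"whale": 5}})
print(type(h.features).__name__)  # typically NoPos, the first union member that matches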
5 changes: 5 additions & 0 deletions htrc/torchlite/models/dashboard.py
@@ -22,11 +22,14 @@ class FilterSettings(BaseModel):
    languages: list[str] = []
    source_institutions: list[str] = []


class DataCleaningSettings(BaseModel):
    language: str | None = None


class Dashboard(BaseModel, arbitrary_types_allowed=True):
    id: PyUuid = Field(default_factory=uuid.uuid4)
    imported_id: UUID
    filters: FilterSettings | None
    datacleaning: DataCleaningSettings | None = None
    widgets: list[ALL_WIDGETS]


@@ -45,12 +48,14 @@ class DashboardCreate(MongoModel):
    description: str | None = None
    imported_id: UUID
    filters: FilterSettings | None = None
    datacleaning: DataCleaningSettings | None = None
    widgets: conlist(ALL_WIDGETS, min_length=1)


class DashboardPatch(MongoModel):
    imported_id: UUID | None = None
    filters: FilterSettings | None = None
    datacleaning: DataCleaningSettings | None = None
    widgets: list[ALL_WIDGETS] | None = None
    is_shared: bool | None = None
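
Reviewer note: with datacleaning threaded through Dashboard, DashboardCreate, and DashboardPatch, a patch payload can now carry cleaning settings. A hypothetical manual test, assuming the existing dashboard PATCH route and the requests package (host, id, and token are placeholders):

import requests

resp = requests.patch(
    "http://localhost:8000/dashboards/<dashboard-id>",  # placeholder host and id
    json={"datacleaning": {"language": "English"}},
    headers={"Authorization": "Bearer <access-token>"},  # placeholder token
)
print(resp.status_code)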
122 changes: 115 additions & 7 deletions htrc/torchlite/routers/dashboards.py
@@ -1,7 +1,7 @@
from uuid import UUID

from authlib.oidc.core import UserInfo
-from fastapi import APIRouter, Depends, HTTPException, status
+from fastapi import APIRouter, Depends, HTTPException, status, UploadFile, File
from fastapi_cache.decorator import cache
from fastapi_cache import FastAPICache
from pymongo import ReturnDocument
@@ -11,13 +11,18 @@
from ..auth.auth import get_current_user, get_user_access_token
from ..config import config
from ..data import apply_filters
from ..data import apply_datacleaning
from ..database import mongo_client
from ..ef.api import ef_api
from ..errors import TorchliteError
from ..managers.workset_manager import WorksetManager
from ..models.dashboard import DashboardSummary, DashboardPatch, DashboardCreate, DashboardPatchUpdate
from ..models.workset import WorksetIdMapping
from ..widgets.base import WidgetDataTypes
import os
import json
import csv
from fastapi.responses import JSONResponse
from ..ef.exceptions import EfApiError

import logging
@@ -46,6 +51,7 @@ def request_key_builder(func, namespace: str = "", *, request: Request = None, r…
async def list_dashboards(workset_manager: WorksetManager,
                          owner: UUID | None = None,
                          user: UserInfo | None = Depends(get_current_user)) -> list[DashboardSummary]:

    if owner == config.TORCHLITE_UID:
        await workset_manager.get_public_worksets()
        workset_manager.get_featured_worksets()
@@ -60,13 +66,13 @@ async def list_dashboards(workset_manager: WorksetManager,

    if not user:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED)

    user_id = UUID(user.get("htrc-guid", user.sub))
    owner = owner or user_id

    if user_id != owner:
        raise HTTPException(status_code=status.HTTP_403_FORBIDDEN)

    return await DashboardSummary.from_mongo(
        mongo_client.db["dashboards"].find({"owner": owner}).to_list(1000)
    )
@@ -195,11 +201,11 @@ async def update_dashboard(dashboard_id: UUID,
        log.error(f"Dashboard patch error: {dashboard}")
        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)


@router.get("/{dashboard_id}/widgets/{widget_type}/data", description="Retrieve widget data")
@cache(key_builder=request_key_builder)
async def get_widget_data(dashboard_id: UUID, widget_type: str,
                          user: UserInfo | None = Depends(get_current_user)):

    dashboard = await get_dashboard(dashboard_id, user)

    # fastapi_cache doesn't seem to preserve pydantic models and instead returns dicts, so converting
@@ -246,11 +252,113 @@ async def get_widget_data(dashboard_id: UUID, widget_type: str,
            status_code=status.HTTP_504_GATEWAY_TIMEOUT,
            detail=f"Server timeout for {imported_id_mapping.workset_id} on request for data for the {widget_type} widget"
        )

    # Check the widget type before cleaning: metadata-only widgets never touch
    # token counts, so they skip the stopword pass entirely.
    filtered_volumes = apply_filters(volumes, filters=dashboard.filters)
    log.debug(f"Total volumes before cleaning: {len(filtered_volumes)}")
    if widget.data_type == WidgetDataTypes.metadata_only:
        return await widget.get_data(filtered_volumes)
    cleaned_volumes = apply_datacleaning(dashboard_id, filtered_volumes, cleaning_settings=dashboard.datacleaning)
    log.debug(f"Total volumes after cleaning: {len(cleaned_volumes)}")
    return await widget.get_data(cleaned_volumes)

-    return await widget.get_data(filtered_volumes)

# GET /dashboards/{dashboard_id}/stopwords/{language}
@router.get("/{dashboard_id}/stopwords/{language}", description="Retrieve stopwords data")
async def get_stopwords_data(dashboard_id: UUID, language: str,
                             user: UserInfo | None = Depends(get_current_user)):

    dashboard = await get_dashboard(dashboard_id, user)
    if not dashboard:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Dashboard {dashboard_id} not found"
        )

    # Normalize the language input
    log.debug(f"Fetching stopwords for dashboard {dashboard_id}, language {language}")
    language = language.lower().strip()
    directory = "stopword_lists"
    stopword_file_path = os.path.join(directory, f"{language}_stopwords.json")

    if not os.path.exists(stopword_file_path):
        log.debug(f"Stopwords file for language '{language}' not found.")
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Stopwords file for language '{language}' not found"
        )

    try:
        # Read and parse the JSON file
        with open(stopword_file_path, 'r', encoding='utf-8') as file:
            stopwords_data = json.load(file)

        # Return a JSON response
        return JSONResponse(
            content=stopwords_data,
            media_type="application/json"
        )
    except json.JSONDecodeError:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error parsing stopwords JSON file for language '{language}'"
        )
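
Reviewer note: a quick smoke test for the new GET route, assuming the requests package (host and id are placeholders):

import requests

resp = requests.get("http://localhost:8000/dashboards/<dashboard-id>/stopwords/english")  # placeholder host/id
if resp.ok:
    print(resp.json()[:5])  # first few entries of the stopword list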


@router.post("/{dashboard_id}/stopwords", description="Upload stopwords file", response_model_exclude_defaults=True)
async def upload_stopwords(dashboard_id: UUID,
user: UserInfo | None = Depends(get_current_user),
file: UploadFile = File(...)) -> DashboardSummary:
dashboard = await get_dashboard(dashboard_id, user)
print("Pooja")
if not dashboard:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Dashboard {dashboard_id} not found"
)

os.makedirs("stopword_lists", exist_ok=True)
file_path = os.path.join("stopword_lists", f"{dashboard_id}_stopwords.json")
try:
# Determine file type based on the filename
filename = file.filename.lower()
content = await file.read() # Read file content as bytes
stopwords_list = []

if filename.endswith(".txt"):
# Handle TXT file: split content into lines
stopwords_list = content.decode("utf-8").splitlines()
stopwords_list = [line.strip() for line in stopwords_list if line.strip()]

elif filename.endswith(".csv"):
# Handle CSV file: parse rows and extract stopwords
decoded_content = content.decode("utf-8").splitlines()
csv_reader = csv.reader(decoded_content)
for row in csv_reader:
stopwords_list.extend([word.strip() for word in row if word.strip()])

else:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Unsupported file format. Please upload a .txt or .csv file."
)

# Save the stopwords as a JSON array
with open(file_path, "w", encoding="utf-8") as json_file:
json.dump(stopwords_list, json_file, ensure_ascii=False, indent=4)

return {
"message": "Stopwords uploaded and processed successfully",
"file_path": file_path,
"filters": {},
"widgets": [],
"owner": dashboard_id
}
except Exception as e:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error processing stopwords file: {e}"
)
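
Reviewer note: a hypothetical round trip for the upload route with a small .txt list, again assuming requests (host and id are placeholders); the saved file is what load_stopwords picks up on the next widget-data request.

import requests

files = {"file": ("custom_stopwords.txt", b"the\nand\nof\n", "text/plain")}
resp = requests.post(
    "http://localhost:8000/dashboards/<dashboard-id>/stopwords",  # placeholder host/id
    files=files,
)
print(resp.status_code, resp.json())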

@router.get("/{dashboard_id}/{data_type}", description="Retrieve workset data or metadata")
@cache(key_builder=request_key_builder)