Changes from all commits

31 commits
6d2f92d  Apply stopwords backend logic (Oct 17, 2024)
1cfc6c1  merge conflicts (Oct 18, 2024)
b3bb80e  Backend logic for stopwords (Oct 23, 2024)
b748f0d  Backend logic for stopwords (Oct 25, 2024)
f1c950d  Added nltk library (Nov 4, 2024)
adef532  Pass cleaned volumes (Nov 7, 2024)
c22b8a0  Pass cleaned volumes (Nov 7, 2024)
d0b3453  Pass cleaned volumes (Nov 7, 2024)
4c6fd1a  Download stopwords (Nov 10, 2024)
bfd6ff7  Download stopwords (Nov 10, 2024)
dce5056  Download stopwords (Nov 12, 2024)
b2d0a1a  Download stopwords (Nov 13, 2024)
36d7962  Download stopwords (Nov 13, 2024)
a1a12d5  Download stopwords (Nov 13, 2024)
c035138  Backend changes (Dec 10, 2024)
2e02ff1  Backend changes (Dec 11, 2024)
1a0ce8a  Backend changes (Dec 11, 2024)
6aa3a6f  Backend changes (Dec 11, 2024)
5f81b58  Backend changes (Dec 11, 2024)
5606a5f  Backend changes (Dec 11, 2024)
650206f  Backend changes (Dec 11, 2024)
8dc1919  Backend changes (Dec 11, 2024)
8b51cf3  Backend changes (Dec 19, 2024)
143c565  Merge branch 'develop' into applystopwordds (dkudeki, Jan 7, 2025)
a776b83  Added the stopword files (how did I get these if they weren't part of… (dkudeki, Jan 13, 2025)
3b528f0  Merge branch 'develop' into applystopwordds (dkudeki, Mar 3, 2025)
3d0f44e  Fixing lint errors, updating version in pyproject (dkudeki, Mar 3, 2025)
7069434  Cleanup changes, stop overwriting collisions when applying stopwords (dkudeki, Mar 4, 2025)
4e369c6  Merge branch 'develop' into applystopwordds (dkudeki, May 20, 2025)
dbd9c8c  Update poetry.lock (dkudeki, May 20, 2025)
d85f19b  AggFeatures type needs to come first, check if language is set in cle… (dkudeki, May 22, 2025)
64 changes: 64 additions & 0 deletions htrc/torchlite/data.py
@@ -1,8 +1,18 @@
from uuid import UUID

import nltk
from .converters import torchlite_volume_meta_from_ef
from .ef.models import Volume
from .models.dashboard import FilterSettings
from .models.dashboard import DataCleaningSettings
from .utils import make_set
from .config import config
from nltk.corpus import stopwords
import os
import json
import logging

log = logging.getLogger(config.PROJECT_NAME)

def apply_filters(volumes: list[Volume], filters: FilterSettings) -> list[Volume]:
    filtered_volumes = []
@@ -34,3 +44,57 @@ def apply_filters(volumes: list[Volume], filters: FilterSettings) -> list[Volume]:
            filtered_volumes.append(volume)

    return filtered_volumes

def load_stopwords(dashboard_id: UUID, language: str, directory="stopword_lists"):
    # Make sure the NLTK corpus is available, then materialize the bundled
    # default lists on disk as JSON.
    nltk.download('stopwords')
    default_languages = ['english', 'german', 'spanish', 'french']
    if not os.path.exists(directory):
        os.makedirs(directory)
    for lang in default_languages:
        stopword_list = stopwords.words(lang)
        stopword_file_path = os.path.join(directory, f"{lang}_stopwords.json")
        with open(stopword_file_path, 'w', encoding='utf-8') as file:
            json.dump(stopword_list, file, ensure_ascii=False, indent=4)

    # A dashboard-specific upload, when present, takes precedence over the
    # bundled defaults.
    stopword_file_path = os.path.join(directory, f"{dashboard_id}_stopwords.json")
    log.debug(stopword_file_path)
    log.debug(os.path.exists(stopword_file_path))
    if os.path.exists(stopword_file_path):
        with open(stopword_file_path, 'r', encoding='utf-8') as file:
            return json.load(file)

    default_stopword_file = os.path.join(directory, f"{language}_stopwords.json")
    if os.path.exists(default_stopword_file):
        with open(default_stopword_file, 'r', encoding='utf-8') as file:
            return json.load(file)

    # Fall back to an empty list so callers can safely test membership.
    return []

def clean_volume_data(volume: Volume, stopwords: list[str]):
    cleaned_data = {}

    for word, count in volume.features.body.items():
        lower_word = word.lower()

        # Drop stopwords; merge counts for words that collide after
        # lowercasing instead of overwriting them.
        if lower_word not in stopwords:
            if lower_word not in cleaned_data:
                cleaned_data[lower_word] = count
            else:
                cleaned_data[lower_word] += count
    volume.features.body = cleaned_data
    return volume

def apply_datacleaning(dashboard_id: UUID, filtered_volumes: list[Volume], cleaning_settings: DataCleaningSettings):
    # Skip cleaning entirely when no settings or no language are set.
    if cleaning_settings and cleaning_settings.language:
        stopwords = load_stopwords(dashboard_id, cleaning_settings.language.lower())

        cleaned_volumes = []
        for volume in filtered_volumes:
            log.debug(f"'me' present before cleaning: {'me' in volume.features.body}")
            cleaned_volume = clean_volume_data(volume, stopwords)
            log.debug(f"'me' present after cleaning: {'me' in cleaned_volume.features.body}")
            cleaned_volumes.append(cleaned_volume)

        return cleaned_volumes
    return filtered_volumes
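
Reviewer note: a minimal sketch of what the new cleaning pass does to a token-count mapping. The SimpleNamespace stand-in is hypothetical (a real Volume needs full EF metadata to instantiate); clean_volume_data only reads and rewrites features.body.

from types import SimpleNamespace
from htrc.torchlite.data import clean_volume_data

# Hypothetical stand-in for an EF Volume.
volume = SimpleNamespace(features=SimpleNamespace(body={"The": 3, "the": 2, "Whale": 1, "whale": 4}))
cleaned = clean_volume_data(volume, stopwords=["the"])
print(cleaned.features.body)  # {'whale': 5}: stopwords dropped, case collisions merged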
2 changes: 1 addition & 1 deletion htrc/torchlite/ef/models.py
@@ -137,4 +137,4 @@ class VolumeAggFeaturesNoPos(BaseModel):
class Volume(BaseModel, Generic[FeaturesT]):
    htid: str
    metadata: VolumeMetadata
-    features: FeaturesT | None = None
+    features: VolumeAggFeaturesNoPos | VolumeFeatures | None = None
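
Reviewer note on commit d85f19b ("AggFeatures type needs to come first"): in a pydantic union, a model whose fields are a subset of another's can validate the same payload, so member order decides which type you get back. A toy sketch, not the real VolumeFeatures/VolumeAggFeaturesNoPos definitions:

from pydantic import BaseModel

class NoPos(BaseModel):  # toy stand-in for VolumeAggFeaturesNoPos
    body: dict[str, int]

class WithPages(BaseModel):  # toy stand-in for VolumeFeatures
    body: dict[str, int]
    pages: list[dict] = []

class Holder(BaseModel):
    features: NoPos | WithPages | None = None

h = Holder(features={"body": {"whale": 5}})
print(type(h.features).__name__)  # typically NoPos, the first union member that matches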
5 changes: 5 additions & 0 deletions htrc/torchlite/models/dashboard.py
@@ -22,11 +22,14 @@ class FilterSettings(BaseModel):
    languages: list[str] = []
    source_institutions: list[str] = []


class DataCleaningSettings(BaseModel):
    language: str | None = None


class Dashboard(BaseModel, arbitrary_types_allowed=True):
    id: PyUuid = Field(default_factory=uuid.uuid4)
    imported_id: UUID
    filters: FilterSettings | None
    datacleaning: DataCleaningSettings | None = None
    widgets: list[ALL_WIDGETS]


@@ -45,12 +48,14 @@ class DashboardCreate(MongoModel):
    description: str | None = None
    imported_id: UUID
    filters: FilterSettings | None = None
    datacleaning: DataCleaningSettings | None = None
    widgets: conlist(ALL_WIDGETS, min_length=1)


class DashboardPatch(MongoModel):
    imported_id: UUID | None = None
    filters: FilterSettings | None = None
    datacleaning: DataCleaningSettings | None = None
    widgets: list[ALL_WIDGETS] | None = None
    is_shared: bool | None = None
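
Reviewer note: with datacleaning threaded through Dashboard, DashboardCreate, and DashboardPatch, a patch payload can now carry cleaning settings. A hypothetical manual test, assuming the existing dashboard PATCH route and the requests package (host, id, and token are placeholders):

import requests

resp = requests.patch(
    "http://localhost:8000/dashboards/<dashboard-id>",  # placeholder host and id
    json={"datacleaning": {"language": "English"}},
    headers={"Authorization": "Bearer <access-token>"},  # placeholder token
)
print(resp.status_code)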
122 changes: 115 additions & 7 deletions htrc/torchlite/routers/dashboards.py
@@ -1,7 +1,7 @@
from uuid import UUID

from authlib.oidc.core import UserInfo
-from fastapi import APIRouter, Depends, HTTPException, status
+from fastapi import APIRouter, Depends, HTTPException, status, UploadFile, File
from fastapi_cache.decorator import cache
from fastapi_cache import FastAPICache
from pymongo import ReturnDocument
@@ -11,13 +11,18 @@
from ..auth.auth import get_current_user, get_user_access_token
from ..config import config
from ..data import apply_filters
from ..data import apply_datacleaning
from ..database import mongo_client
from ..ef.api import ef_api
from ..errors import TorchliteError
from ..managers.workset_manager import WorksetManager
from ..models.dashboard import DashboardSummary, DashboardPatch, DashboardCreate, DashboardPatchUpdate
from ..models.workset import WorksetIdMapping
from ..widgets.base import WidgetDataTypes
import os
import json
import csv
from fastapi.responses import JSONResponse
from ..ef.exceptions import EfApiError

import logging
@@ -46,6 +51,7 @@ def request_key_builder(func, namespace: str = "", *, request: Request = None, r…
async def list_dashboards(workset_manager: WorksetManager,
                          owner: UUID | None = None,
                          user: UserInfo | None = Depends(get_current_user)) -> list[DashboardSummary]:

    if owner == config.TORCHLITE_UID:
        await workset_manager.get_public_worksets()
        workset_manager.get_featured_worksets()
@@ -60,13 +66,13 @@ async def list_dashboards(workset_manager: WorksetManager,

    if not user:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED)

    user_id = UUID(user.get("htrc-guid", user.sub))
    owner = owner or user_id

    if user_id != owner:
        raise HTTPException(status_code=status.HTTP_403_FORBIDDEN)

    return await DashboardSummary.from_mongo(
        mongo_client.db["dashboards"].find({"owner": owner}).to_list(1000)
    )
@@ -195,11 +201,11 @@ async def update_dashboard(dashboard_id: UUID,
        log.error(f"Dashboard patch error: {dashboard}")
        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)


@router.get("/{dashboard_id}/widgets/{widget_type}/data", description="Retrieve widget data")
@cache(key_builder=request_key_builder)
async def get_widget_data(dashboard_id: UUID, widget_type: str,
                          user: UserInfo | None = Depends(get_current_user)):

    dashboard = await get_dashboard(dashboard_id, user)

    # fastapi_cache doesn't seem to preserve pydantic models and instead returns dicts, so converting
@@ -246,11 +252,113 @@ async def get_widget_data(dashboard_id: UUID, widget_type: str,
            status_code=status.HTTP_504_GATEWAY_TIMEOUT,
            detail=f"Server timeout for {imported_id_mapping.workset_id} on request for data for the {widget_type} widget"
        )

    # Check the widget type before cleaning: metadata-only widgets never touch
    # token counts, so they skip the stopword pass entirely.
    filtered_volumes = apply_filters(volumes, filters=dashboard.filters)
    log.debug(f"Total volumes before cleaning: {len(filtered_volumes)}")
    if widget.data_type == WidgetDataTypes.metadata_only:
        return await widget.get_data(filtered_volumes)
    cleaned_volumes = apply_datacleaning(dashboard_id, filtered_volumes, cleaning_settings=dashboard.datacleaning)
    log.debug(f"Total volumes after cleaning: {len(cleaned_volumes)}")
    return await widget.get_data(cleaned_volumes)

-    return await widget.get_data(filtered_volumes)

# GET /dashboards/{dashboard_id}/stopwords/{language}
@router.get("/{dashboard_id}/stopwords/{language}", description="Retrieve stopwords data")
async def get_stopwords_data(dashboard_id: UUID, language: str,
                             user: UserInfo | None = Depends(get_current_user)):

    dashboard = await get_dashboard(dashboard_id, user)
    if not dashboard:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Dashboard {dashboard_id} not found"
        )

    # Normalize the language input
    log.debug(f"Fetching stopwords for dashboard {dashboard_id}, language {language}")
    language = language.lower().strip()
    directory = "stopword_lists"
    stopword_file_path = os.path.join(directory, f"{language}_stopwords.json")

    if not os.path.exists(stopword_file_path):
        log.debug(f"Stopwords file for language '{language}' not found.")
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Stopwords file for language '{language}' not found"
        )

    try:
        # Read and parse the JSON file
        with open(stopword_file_path, 'r', encoding='utf-8') as file:
            stopwords_data = json.load(file)

        # Return a JSON response
        return JSONResponse(
            content=stopwords_data,
            media_type="application/json"
        )
    except json.JSONDecodeError:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error parsing stopwords JSON file for language '{language}'"
        )
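
Reviewer note: a quick smoke test for the new GET route, assuming the requests package (host and id are placeholders):

import requests

resp = requests.get("http://localhost:8000/dashboards/<dashboard-id>/stopwords/english")  # placeholder host/id
if resp.ok:
    print(resp.json()[:5])  # first few entries of the stopword list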


@router.post("/{dashboard_id}/stopwords", description="Upload stopwords file", response_model_exclude_defaults=True)
async def upload_stopwords(dashboard_id: UUID,
user: UserInfo | None = Depends(get_current_user),
file: UploadFile = File(...)) -> DashboardSummary:
dashboard = await get_dashboard(dashboard_id, user)
print("Pooja")
if not dashboard:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Dashboard {dashboard_id} not found"
)

os.makedirs("stopword_lists", exist_ok=True)
file_path = os.path.join("stopword_lists", f"{dashboard_id}_stopwords.json")
try:
# Determine file type based on the filename
filename = file.filename.lower()
content = await file.read() # Read file content as bytes
stopwords_list = []

if filename.endswith(".txt"):
# Handle TXT file: split content into lines
stopwords_list = content.decode("utf-8").splitlines()
stopwords_list = [line.strip() for line in stopwords_list if line.strip()]

elif filename.endswith(".csv"):
# Handle CSV file: parse rows and extract stopwords
decoded_content = content.decode("utf-8").splitlines()
csv_reader = csv.reader(decoded_content)
for row in csv_reader:
stopwords_list.extend([word.strip() for word in row if word.strip()])

else:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Unsupported file format. Please upload a .txt or .csv file."
)

# Save the stopwords as a JSON array
with open(file_path, "w", encoding="utf-8") as json_file:
json.dump(stopwords_list, json_file, ensure_ascii=False, indent=4)

return {
"message": "Stopwords uploaded and processed successfully",
"file_path": file_path,
"filters": {},
"widgets": [],
"owner": dashboard_id
}
except Exception as e:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error processing stopwords file: {e}"
)
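
Reviewer note: a hypothetical round trip for the upload route with a small .txt list, again assuming requests (host and id are placeholders); the saved file is what load_stopwords picks up on the next widget-data request.

import requests

files = {"file": ("custom_stopwords.txt", b"the\nand\nof\n", "text/plain")}
resp = requests.post(
    "http://localhost:8000/dashboards/<dashboard-id>/stopwords",  # placeholder host/id
    files=files,
)
print(resp.status_code, resp.json())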

@router.get("/{dashboard_id}/{data_type}", description="Retrieve workset data or metadata")
@cache(key_builder=request_key_builder)