From a3eef1a24035242dba3cb700ea93479fdd30e84b Mon Sep 17 00:00:00 2001 From: HmbleCreator Date: Sun, 29 Jun 2025 20:55:01 +0530 Subject: [PATCH 1/5] feat(ui): add Token Estimator link to footer - Adds a "Token Estimator" link (with icon) to the left column of the site footer. - Link points to https://gitingest.com/tokencount and opens in a new tab. - Styled consistently with other resource links. - No changes to CLI or Python package. Closes #318 --- src/server/templates/components/footer.jinja | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/server/templates/components/footer.jinja b/src/server/templates/components/footer.jinja index eb561677..8dd3cb8c 100644 --- a/src/server/templates/components/footer.jinja +++ b/src/server/templates/components/footer.jinja @@ -21,6 +21,15 @@ class="w-4 h-4 mr-1"> Python package + + token estimator + Token Estimator +
From 6247f92222fbacdf610fa1aec9e07576ea676a1b Mon Sep 17 00:00:00 2001 From: HmbleCreator Date: Mon, 30 Jun 2025 14:35:29 +0530 Subject: [PATCH 2/5] feat: Add token estimator documentation page and simplify footer link text - Add GET endpoint for /api/tokencount with interactive documentation - Include API usage examples and interactive form for testing --- src/server/routers/index.py | 176 ++++++++++++++++++- src/server/templates/components/footer.jinja | 9 +- 2 files changed, 175 insertions(+), 10 deletions(-) diff --git a/src/server/routers/index.py b/src/server/routers/index.py index 9385d6ff..cc1cf80c 100644 --- a/src/server/routers/index.py +++ b/src/server/routers/index.py @@ -1,16 +1,64 @@ """Module defining the FastAPI router for the home page of the application.""" -from fastapi import APIRouter, Depends, Request -from fastapi.responses import HTMLResponse +from fastapi import APIRouter, Depends, Request, Form, HTTPException, Body +from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse +from fastapi.templating import Jinja2Templates +from autotiktokenizer import AutoTikTokenizer +import tiktoken +from pydantic import BaseModel, Field +from typing import Optional from gitingest.utils.compat_typing import Annotated from server.models import QueryForm from server.query_processor import process_query -from server.server_config import EXAMPLE_REPOS, templates +from server.server_config import EXAMPLE_REPOS from server.server_utils import limiter router = APIRouter() +templates = Jinja2Templates(directory="server/templates") + +SUPPORTED_MODELS = { + 'GPT-2 (OpenAI)': 'openai-community/gpt2', + 'GPT-3 (OpenAI)': 'openai-community/gpt2', + 'GPT-3.5 (OpenAI)': 'openai-community/gpt2', + 'GPT-3.5-turbo (OpenAI)': 'openai-community/gpt2', + 'GPT-4 (OpenAI)': 'openai-community/gpt2', + 'Claude (approximate, uses GPT-2)': 'openai-community/gpt2', + 'Gemini (approximate, uses T5)': 't5-base', + 'Llama-2 (Meta)': 'meta-llama/Llama-2-7b-hf', + 
'Llama-3 (Meta)': 'meta-llama/Meta-Llama-3-8B', + 'Mistral-7B (MistralAI)': 'mistralai/Mistral-7B-v0.1', + 'Mixtral-8x7B (MistralAI)': 'mistralai/Mixtral-8x7B-v0.1', + 'Phi-3-mini (Microsoft)': 'microsoft/phi-3-mini-4k-instruct', + 'Gemma-2B (Google)': 'google/gemma-2b', + 'Qwen2-7B (Alibaba)': 'Qwen/Qwen2-7B', + 'Yi-34B (01.AI)': '01-ai/Yi-34B-Chat', + 'Falcon-7B (TII)': 'tiiuae/falcon-7b', + 'MPT-7B (MosaicML)': 'mosaicml/mpt-7b', + 'Baichuan-7B (Baichuan)': 'baichuan-inc/Baichuan-7B', + 'XLM-RoBERTa-base (Facebook)': 'xlm-roberta-base', + 'RoBERTa-base (Facebook)': 'roberta-base', + 'DistilBERT-base-uncased': 'distilbert-base-uncased', + 'GPT-Neo-1.3B (EleutherAI)': 'EleutherAI/gpt-neo-1.3B', + 'GPT-J-6B (EleutherAI)': 'EleutherAI/gpt-j-6B', + 'GPT-Bloom-560m (BigScience)': 'bigscience/bloom-560m', + 'BERT-base-uncased': 'bert-base-uncased', + 'T5-base': 't5-base', +} +# Note: Gemini and Claude use approximate tokenizers (T5 and GPT-2, respectively) as no official public tokenizers exist for these models. 
+ +def get_tokenizer(model_id): + return AutoTikTokenizer.from_pretrained(model_id) + +def count_tokens(input_text, model_id): + if model_id == 'openai-community/gpt2': + # Use tiktoken for OpenAI models + enc = tiktoken.encoding_for_model("gpt-3.5-turbo") + return len(enc.encode(input_text)) + else: + tokenizer = AutoTikTokenizer.from_pretrained(model_id) + return len(tokenizer.encode(input_text)) @router.get("/", response_class=HTMLResponse) async def home(request: Request) -> HTMLResponse: @@ -74,3 +122,125 @@ async def index_post(request: Request, form: Annotated[QueryForm, Depends(QueryF is_index=True, token=resolved_token, ) + + +class TokenCountRequest(BaseModel): + input_text: str = Field(..., description="The text to count tokens for") + model_id: str = Field(default="openai-community/gpt2", description="The model ID to use for tokenization") + +class TokenCountResponse(BaseModel): + token_count: int = Field(..., description="Number of tokens in the input text") + model_id: str = Field(..., description="Model ID used for tokenization") + character_count: int = Field(..., description="Number of characters in the input text") + +@router.post("/api/tokencount", response_model=TokenCountResponse) +async def api_token_count( + request: Optional[TokenCountRequest] = None, + input_text: str = Form(None), + model_id: str = Form(default="openai-community/gpt2"), +): + """ + Count tokens in the provided text using the specified model's tokenizer. + Accepts both JSON and form data. + """ + # If JSON body was provided, use that + if request: + text = request.input_text + model = request.model_id + # Otherwise use form data + else: + text = input_text + model = model_id + + if not text or not text.strip(): + raise HTTPException(status_code=400, detail="Input text cannot be empty") + + if model not in SUPPORTED_MODELS.values(): + raise HTTPException( + status_code=400, + detail=f"Unsupported model ID. 
Must be one of: {', '.join(SUPPORTED_MODELS.values())}" + ) + + try: + token_count = count_tokens(text, model) + return TokenCountResponse( + token_count=token_count, + model_id=model, + character_count=len(text) + ) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + +@router.get("/tokencount", response_class=HTMLResponse) +async def tokencount_ui(request: Request): + return templates.TemplateResponse( + "tokencount.jinja", + {"request": request, "supported_models": SUPPORTED_MODELS} + ) + +@router.get("/api/tokencount", response_class=HTMLResponse) +async def api_token_count_docs(): + """Serve a simple documentation page for the token count API.""" + html = f''' + + + Token Estimator API + + + +

Token Estimator API

+

This API endpoint counts tokens for various AI language models.

+ +

Endpoint

+

POST /api/tokencount

+ +

Request Format

+

Send a POST request with either:

+

JSON Body:

+
{{
+    "input_text": "Your text here",
+    "model_id": "openai-community/gpt2"  // optional
+}}
+ +

Or Form Data:

+
input_text: Your text here
+model_id: openai-community/gpt2  // optional
+ +

Response Format

+
{{
+    "token_count": 123,
+    "model_id": "openai-community/gpt2",
+    "character_count": 456
+}}
+ +

Try it out

+
+

+

+

+
+ +

Example using curl

+
curl -X POST http://localhost:8000/api/tokencount \\
+    -H "Content-Type: application/json" \\
+    -d '{{"input_text": "Hello, world!", "model_id": "openai-community/gpt2"}}'
+ + + ''' + return HTMLResponse(content=html) diff --git a/src/server/templates/components/footer.jinja b/src/server/templates/components/footer.jinja index 8dd3cb8c..ac4d5eca 100644 --- a/src/server/templates/components/footer.jinja +++ b/src/server/templates/components/footer.jinja @@ -21,13 +21,8 @@ class="w-4 h-4 mr-1"> Python package - - token estimator + + token estimator Token Estimator
From cef47a2982b20656fbda4808ab0af2cb5c3c1e3c Mon Sep 17 00:00:00 2001 From: HmbleCreator Date: Mon, 30 Jun 2025 14:35:51 +0530 Subject: [PATCH 3/5] feat: Add token estimator documentation page and simplify footer link text - Add GET endpoint for /api/tokencount with interactive documentation - Include API usage examples and interactive form for testing --- tests/test_server.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tests/test_server.py diff --git a/tests/test_server.py b/tests/test_server.py new file mode 100644 index 00000000..bb1f229b --- /dev/null +++ b/tests/test_server.py @@ -0,0 +1,23 @@ +from fastapi.testclient import TestClient +from src.server.main import app + +client = TestClient(app, base_url="http://localhost") + + +def test_tokencount_valid(): + response = client.post("/api/tokencount", json={"input_text": "Hello world!", "model_id": "openai-community/gpt2"}, headers={"host": "localhost"}) + if response.status_code != 200: + print("Response content:", response.content) + assert response.status_code == 200 + data = response.json() + assert "token_count" in data + assert isinstance(data["token_count"], int) + assert data["token_count"] > 0 + +def test_tokencount_missing_input(): + response = client.post("/api/tokencount", json={"model_id": "openai-community/gpt2"}, headers={"host": "localhost"}) + if response.status_code != 400: + print("Response content:", response.content) + assert response.status_code == 400 + data = response.json() + assert "error" in data \ No newline at end of file From 48328ab0e42c595f3ec7bdd8e7c580b46a0a88b1 Mon Sep 17 00:00:00 2001 From: HmbleCreator Date: Mon, 30 Jun 2025 18:09:01 +0530 Subject: [PATCH 4/5] feat: use Jinja template for token estimator API docs - Add tokencount_api.jinja for styled token estimator documentation - Update /api/tokencount GET endpoint to render the new template - Documentation now matches the look and feel of other API docs --- src/server/routers/index.py 
| 70 ++--------------------- src/server/templates/tokencount_api.jinja | 60 +++++++++++++++++++ 2 files changed, 65 insertions(+), 65 deletions(-) create mode 100644 src/server/templates/tokencount_api.jinja diff --git a/src/server/routers/index.py b/src/server/routers/index.py index cc1cf80c..8d051a6e 100644 --- a/src/server/routers/index.py +++ b/src/server/routers/index.py @@ -179,68 +179,8 @@ async def tokencount_ui(request: Request): ) @router.get("/api/tokencount", response_class=HTMLResponse) -async def api_token_count_docs(): - """Serve a simple documentation page for the token count API.""" - html = f''' - - - Token Estimator API - - - -

Token Estimator API

-

This API endpoint counts tokens for various AI language models.

- -

Endpoint

-

POST /api/tokencount

- -

Request Format

-

Send a POST request with either:

-

JSON Body:

-
{{
-    "input_text": "Your text here",
-    "model_id": "openai-community/gpt2"  // optional
-}}
- -

Or Form Data:

-
input_text: Your text here
-model_id: openai-community/gpt2  // optional
- -

Response Format

-
{{
-    "token_count": 123,
-    "model_id": "openai-community/gpt2",
-    "character_count": 456
-}}
- -

Try it out

-
-

-

-

-
- -

Example using curl

-
curl -X POST http://localhost:8000/api/tokencount \\
-    -H "Content-Type: application/json" \\
-    -d '{{"input_text": "Hello, world!", "model_id": "openai-community/gpt2"}}'
- - - ''' - return HTMLResponse(content=html) +async def api_token_count_docs(request: Request): + return templates.TemplateResponse( + "tokencount_api.jinja", + {"request": request} + ) diff --git a/src/server/templates/tokencount_api.jinja b/src/server/templates/tokencount_api.jinja new file mode 100644 index 00000000..9796236f --- /dev/null +++ b/src/server/templates/tokencount_api.jinja @@ -0,0 +1,60 @@ +{% extends "base.jinja" %} +{% block title %}Token Estimator API{% endblock %} +{% block content %} +
+
+
+

Token Estimator API

+
+
+
+
+ + + +
+
+

Count tokens for OpenAI and open-source models using fast tokenizers.

+
+
+
+

Endpoint

+

POST /api/tokencount

+

Request Format

+

Send a POST request with either:

+

JSON Body:

+
{
+    "input_text": "Your text here",
+    "model_id": "openai-community/gpt2"  // optional
+}
+

Or Form Data:

+
input_text: Your text here
+model_id: openai-community/gpt2  // optional
+

Response Format

+
{
+    "token_count": 123,
+    "model_id": "openai-community/gpt2",
+    "character_count": 456
+}
+

Supported Models

+
    +
  • OpenAI models (fast, via tiktoken)
  • +
  • Open-source models (fast, via autotiktokenizer)
  • +
+

Try it out

+
+ + + + + +
+

Example using curl

+
curl -X POST http://localhost:8000/api/tokencount \
+    -H "Content-Type: application/json" \
+    -d '{"input_text": "Hello, world!", "model_id": "openai-community/gpt2"}'
+                
+
+
+
+{% endblock %} \ No newline at end of file From 98e5c458df509953ffdf888fdc1b40345c1eb3cc Mon Sep 17 00:00:00 2001 From: HmbleCreator Date: Tue, 1 Jul 2025 08:36:22 +0530 Subject: [PATCH 5/5] Refactor token count API and routing, enforce API-only for /api/tokencount, and backend cleanup - Removed GET /api/tokencount route that rendered a Jinja template; /api/tokencount is now POST-only and returns JSON. - Renamed and refactored tokencount_api.jinja to tokencount.jinja, using a select dropdown for model selection and matching git_form.jinja UX. - Updated footer link to point to /tokencount (user-facing form) instead of /api/tokencount. - Ensured /tokencount is the user-facing form (GET/POST, Jinja) and /api/tokencount is API-only (POST, JSON). - Updated dependency management: added autotiktokenizer to requirements.txt and pyproject.toml, and added a guideline to CONTRIBUTING.md. - Cleaned up unused imports and dead code in backend. --- CONTRIBUTING.md | 4 ++ pyproject.toml | 3 +- requirements.txt | 1 + src/server/routers/dynamic.py | 4 +- src/server/routers/index.py | 39 ++++++++++--- src/server/templates/components/footer.jinja | 2 +- src/server/templates/tokencount.jinja | 40 +++++++++++++ src/server/templates/tokencount_api.jinja | 60 -------------------- tests/test_server.py | 4 +- 9 files changed, 84 insertions(+), 73 deletions(-) create mode 100644 src/server/templates/tokencount.jinja delete mode 100644 src/server/templates/tokencount_api.jinja diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3ece5d35..9b5fc51a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -85,3 +85,7 @@ Thanks for your interest in contributing to Gitingest! 🚀 Gitingest aims to be 13. Wait for the maintainers to review your pull request. If there are any issues, fix them and repeat steps 6 to 12. 
*(Optional) Invite project maintainer to your branch for easier collaboration.* + +## Dependency Management + +When you add a new import from an external package, make sure to add it to both `requirements.txt` and `pyproject.toml` (if applicable). This ensures all environments and CI/CD pipelines have the correct dependencies installed. diff --git a/pyproject.toml b/pyproject.toml index 5171396d..9edeb0ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "pathspec>=0.12.1", "typing_extensions>= 4.0.0; python_version < '3.10'", "uvicorn>=0.11.7", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2020-150) + "autotiktokenizer=*", ] license = {file = "LICENSE"} @@ -80,7 +81,7 @@ ignore = [ # https://docs.astral.sh/ruff/rules/... "BLE001", # blind-except, TODO: replace with specific exceptions "FAST003", # fast-api-unused-path-parameter, TODO: fix ] -per-file-ignores = { "tests/**/*.py" = ["S101"] } # Skip the “assert used” warning +per-file-ignores = { "tests/**/*.py" = ["S101"] } # Skip the "assert used" warning [tool.ruff.lint.pylint] max-returns = 10 diff --git a/requirements.txt b/requirements.txt index 74042c48..b30d81f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ slowapi starlette>=0.40.0 # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw tiktoken>=0.7.0 # Support for o200k_base encoding uvicorn>=0.11.7 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2020-150 +autotiktokenizer diff --git a/src/server/routers/dynamic.py b/src/server/routers/dynamic.py index 5ed36fe4..8e563c3b 100644 --- a/src/server/routers/dynamic.py +++ b/src/server/routers/dynamic.py @@ -1,6 +1,6 @@ """The dynamic router module defines handlers for dynamic path requests.""" -from fastapi import APIRouter, Depends, Request +from fastapi import APIRouter, Depends, Request, HTTPException from fastapi.responses import HTMLResponse from gitingest.utils.compat_typing import Annotated @@ -33,6 +33,8 @@ async def 
catch_all(request: Request, full_path: str) -> HTMLResponse: and other default parameters such as loading state and file size. """ + if full_path.startswith("api/"): + raise HTTPException(status_code=405, detail="Method Not Allowed") return templates.TemplateResponse( "git.jinja", { diff --git a/src/server/routers/index.py b/src/server/routers/index.py index 8d051a6e..cb7cf053 100644 --- a/src/server/routers/index.py +++ b/src/server/routers/index.py @@ -1,11 +1,10 @@ """Module defining the FastAPI router for the home page of the application.""" -from fastapi import APIRouter, Depends, Request, Form, HTTPException, Body -from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse +from fastapi import APIRouter, Depends, Request, Form, HTTPException +from fastapi.responses import HTMLResponse from fastapi.templating import Jinja2Templates from autotiktokenizer import AutoTikTokenizer import tiktoken -from pydantic import BaseModel, Field from typing import Optional from gitingest.utils.compat_typing import Annotated @@ -13,6 +12,7 @@ from server.query_processor import process_query from server.server_config import EXAMPLE_REPOS from server.server_utils import limiter +from pydantic import BaseModel, Field router = APIRouter() @@ -175,12 +175,35 @@ async def api_token_count( async def tokencount_ui(request: Request): return templates.TemplateResponse( "tokencount.jinja", - {"request": request, "supported_models": SUPPORTED_MODELS} + {"request": request, "supported_models": SUPPORTED_MODELS, "input_text": "", "model_id": "openai-community/gpt2", "result": None, "error": None} ) -@router.get("/api/tokencount", response_class=HTMLResponse) -async def api_token_count_docs(request: Request): +@router.post("/tokencount", response_class=HTMLResponse) +async def tokencount_post(request: Request, input_text: str = Form(...), model_id: str = Form("openai-community/gpt2")): + error = None + result = None + if not input_text or not input_text.strip(): + error = 
"Input text cannot be empty." + elif model_id not in SUPPORTED_MODELS.values(): + error = f"Unsupported model ID. Must be one of: {', '.join(SUPPORTED_MODELS.values())}" + else: + try: + token_count = count_tokens(input_text, model_id) + result = { + "token_count": token_count, + "model_id": model_id, + "character_count": len(input_text) + } + except Exception as e: + error = str(e) return templates.TemplateResponse( - "tokencount_api.jinja", - {"request": request} + "tokencount.jinja", + { + "request": request, + "supported_models": SUPPORTED_MODELS, + "input_text": input_text, + "model_id": model_id, + "result": result, + "error": error + } ) diff --git a/src/server/templates/components/footer.jinja b/src/server/templates/components/footer.jinja index ac4d5eca..581fb149 100644 --- a/src/server/templates/components/footer.jinja +++ b/src/server/templates/components/footer.jinja @@ -21,7 +21,7 @@ class="w-4 h-4 mr-1"> Python package - + token estimator Token Estimator diff --git a/src/server/templates/tokencount.jinja b/src/server/templates/tokencount.jinja new file mode 100644 index 00000000..46a77f81 --- /dev/null +++ b/src/server/templates/tokencount.jinja @@ -0,0 +1,40 @@ +{% extends "base.jinja" %} +{% block title %}Token Estimator{% endblock %} +{% block content %} +
+
+
+

Token Estimator

+
+
+ + +
+
+ + +
+
+ +
+
+ {% if result %} +
+

Result

+

Token count: {{ result.token_count }}

+

Character count: {{ result.character_count }}

+

Model: {{ result.model_id }}

+
+ {% endif %} + {% if error %} +
+ Error: {{ error }} +
+ {% endif %} +
+
+{% endblock %} \ No newline at end of file diff --git a/src/server/templates/tokencount_api.jinja b/src/server/templates/tokencount_api.jinja deleted file mode 100644 index 9796236f..00000000 --- a/src/server/templates/tokencount_api.jinja +++ /dev/null @@ -1,60 +0,0 @@ -{% extends "base.jinja" %} -{% block title %}Token Estimator API{% endblock %} -{% block content %} -
-
-
-

Token Estimator API

-
-
-
-
- - - -
-
-

Count tokens for OpenAI and open-source models using fast tokenizers.

-
-
-
-

Endpoint

-

POST /api/tokencount

-

Request Format

-

Send a POST request with either:

-

JSON Body:

-
{
-    "input_text": "Your text here",
-    "model_id": "openai-community/gpt2"  // optional
-}
-

Or Form Data:

-
input_text: Your text here
-model_id: openai-community/gpt2  // optional
-

Response Format

-
{
-    "token_count": 123,
-    "model_id": "openai-community/gpt2",
-    "character_count": 456
-}
-

Supported Models

-
    -
  • OpenAI models (fast, via tiktoken)
  • -
  • Open-source models (fast, via autotiktokenizer)
  • -
-

Try it out

-
- - - - - -
-

Example using curl

-
curl -X POST http://localhost:8000/api/tokencount \
-    -H "Content-Type: application/json" \
-    -d '{"input_text": "Hello, world!", "model_id": "openai-community/gpt2"}'
-                
-
-
-
-{% endblock %} \ No newline at end of file diff --git a/tests/test_server.py b/tests/test_server.py index bb1f229b..4b48848c 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -5,7 +5,7 @@ def test_tokencount_valid(): - response = client.post("/api/tokencount", json={"input_text": "Hello world!", "model_id": "openai-community/gpt2"}, headers={"host": "localhost"}) + response = client.post("/tokencount", json={"input_text": "Hello world!", "model_id": "openai-community/gpt2"}, headers={"host": "localhost"}) if response.status_code != 200: print("Response content:", response.content) assert response.status_code == 200 @@ -15,7 +15,7 @@ def test_tokencount_valid(): assert data["token_count"] > 0 def test_tokencount_missing_input(): - response = client.post("/api/tokencount", json={"model_id": "openai-community/gpt2"}, headers={"host": "localhost"}) + response = client.post("/tokencount", json={"model_id": "openai-community/gpt2"}, headers={"host": "localhost"}) if response.status_code != 400: print("Response content:", response.content) assert response.status_code == 400