Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DONT MERGE]feat(wren-ai-service): generate semantics for alias #976

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion deployment/kustomizations/base/cm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ data:
- name: sql_regeneration
llm: litellm_llm.gpt-4o-mini-2024-07-18
engine: wren_ui
- name: semantics_description
- name: semantics_enrichment
llm: litellm_llm.gpt-4o-mini-2024-07-18
- name: relationship_recommendation
llm: litellm_llm.gpt-4o-mini-2024-07-18
Expand Down
2 changes: 1 addition & 1 deletion docker/config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ pipes:
- name: sql_regeneration
llm: litellm_llm.gpt-4o-mini-2024-07-18
engine: wren_ui
- name: semantics_description
- name: semantics_enrichment
llm: litellm_llm.gpt-4o-mini-2024-07-18
- name: relationship_recommendation
llm: litellm_llm.gpt-4o-mini-2024-07-18
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ pipes:
- name: sql_regeneration
llm: litellm_llm.gemini/gemini-2.0-flash-exp
engine: wren_ui
- name: semantics_description
- name: semantics_enrichment
llm: litellm_llm.gemini/gemini-2.0-flash-exp
- name: relationship_recommendation
llm: litellm_llm.gemini/gemini-2.0-flash-exp
Expand Down
2 changes: 1 addition & 1 deletion wren-ai-service/docs/config_examples/config.groq.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ pipes:
- name: sql_regeneration
llm: litellm_llm.groq/llama-3.3-70b-specdec
engine: wren_ui
- name: semantics_description
- name: semantics_enrichment
llm: litellm_llm.groq/llama-3.3-70b-specdec
- name: relationship_recommendation
llm: litellm_llm.groq/llama-3.3-70b-specdec
Expand Down
2 changes: 1 addition & 1 deletion wren-ai-service/docs/config_examples/config.ollama.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ pipes:
- name: sql_regeneration
llm: litellm_llm.openai/phi4:14b
engine: wren_ui
- name: semantics_description
- name: semantics_enrichment
llm: litellm_llm.openai/phi4:14b
- name: relationship_recommendation
llm: litellm_llm.openai/phi4:14b
Expand Down
16 changes: 8 additions & 8 deletions wren-ai-service/src/globals.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from src.web.v1.services.chart_adjustment import ChartAdjustmentService
from src.web.v1.services.question_recommendation import QuestionRecommendation
from src.web.v1.services.relationship_recommendation import RelationshipRecommendation
from src.web.v1.services.semantics_description import SemanticsDescription
from src.web.v1.services.semantics_enrichment import SemanticsEnrichment
from src.web.v1.services.semantics_preparation import SemanticsPreparationService
from src.web.v1.services.sql_answer import SqlAnswerService
from src.web.v1.services.sql_expansion import SqlExpansionService
Expand All @@ -29,18 +29,18 @@
class ServiceContainer:
ask_service: AskService
ask_details_service: AskDetailsService
chart_service: ChartService
chart_adjustment_service: ChartAdjustmentService
question_recommendation: QuestionRecommendation
relationship_recommendation: RelationshipRecommendation
semantics_description: SemanticsDescription
semantics_enrichment: SemanticsEnrichment
semantics_preparation_service: SemanticsPreparationService
chart_service: ChartService
chart_adjustment_service: ChartAdjustmentService
sql_answer_service: SqlAnswerService
sql_expansion_service: SqlExpansionService
sql_explanation_service: SqlExplanationService
sql_regeneration_service: SqlRegenerationService
sql_pairs_preparation_service: SqlPairsPreparationService
sql_question_service: SqlQuestionService
sql_regeneration_service: SqlRegenerationService


@dataclass
Expand All @@ -58,10 +58,10 @@ def create_service_container(
"ttl": settings.query_cache_ttl,
}
return ServiceContainer(
semantics_description=SemanticsDescription(
semantics_enrichment=SemanticsEnrichment(
pipelines={
"semantics_description": generation.SemanticsDescription(
**pipe_components["semantics_description"],
"semantics_enrichment": generation.SemanticsEnrichment(
**pipe_components["semantics_enrichment"],
)
},
**query_cache,
Expand Down
2 changes: 1 addition & 1 deletion wren-ai-service/src/pipelines/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def dry_run_pipeline(pipeline_cls: BasicPipeline, pipeline_name: str, **kwargs):
from src.providers import generate_components
from src.utils import init_langfuse, setup_custom_logger

setup_custom_logger("wren-ai-service", level_str=settings.logging_level)
setup_custom_logger("wren-ai-service", level_str=settings.logging_level, is_dev=True)

pipe_components = generate_components(settings.components)
pipeline = pipeline_cls(**pipe_components[pipeline_name])
Expand Down
10 changes: 5 additions & 5 deletions wren-ai-service/src/pipelines/generation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from .intent_classification import IntentClassification
from .question_recommendation import QuestionRecommendation
from .relationship_recommendation import RelationshipRecommendation
from .semantics_description import SemanticsDescription
from .semantics_enrichment import SemanticsEnrichment
from .sql_answer import SQLAnswer
from .sql_breakdown import SQLBreakdown
from .sql_correction import SQLCorrection
Expand All @@ -18,22 +18,22 @@
from .sql_summary import SQLSummary

__all__ = [
"SQLRegeneration",
"ChartGeneration",
"ChartAdjustment",
"ChartGeneration",
"DataAssistance",
"FollowUpSQLGeneration",
"IntentClassification",
"QuestionRecommendation",
"RelationshipRecommendation",
"SemanticsDescription",
"SemanticsEnrichment",
"SQLAnswer",
"SQLBreakdown",
"SQLCorrection",
"SQLExpansion",
"SQLExplanation",
"SQLGeneration",
"SQLGenerationReasoning",
"SQLSummary",
"SQLQuestion",
"SQLRegeneration",
"SQLSummary",
]
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

## Start of Pipeline
@observe(capture_input=False)
def picked_models(mdl: dict, selected_models: list[str]) -> list[dict]:
def picked_models(mdl: dict) -> list[dict]:
def relation_filter(column: dict) -> bool:
return "relationship" not in column

Expand All @@ -27,6 +27,7 @@ def column_formatter(columns: list[dict]) -> list[dict]:
"name": column["name"],
"type": column["type"],
"properties": {
"alias": column["properties"].get("displayName", ""),
"description": column["properties"].get("description", ""),
},
}
Expand All @@ -35,19 +36,17 @@ def column_formatter(columns: list[dict]) -> list[dict]:
]

def extract(model: dict) -> dict:
prop = model["properties"]
return {
"name": model["name"],
"columns": column_formatter(model["columns"]),
"properties": {
"description": model["properties"].get("description", ""),
"alias": prop.get("displayName", ""),
"description": prop.get("description", ""),
},
}

return [
extract(model)
for model in mdl.get("models", [])
if model.get("name", "") in selected_models
]
return [extract(model) for model in mdl.get("models", [])]


@observe(capture_input=False)
Expand Down Expand Up @@ -89,95 +88,60 @@ def wrapper(text: str) -> str:


## End of Pipeline
class ModelProperties(BaseModel):
class Properties(BaseModel):
alias: str
description: str


class ModelColumns(BaseModel):
name: str
properties: ModelProperties
properties: Properties


class SemanticModel(BaseModel):
name: str
columns: list[ModelColumns]
properties: ModelProperties
properties: Properties


class SemanticResult(BaseModel):
models: list[SemanticModel]


SEMANTICS_DESCRIPTION_MODEL_KWARGS = {
SEMANTICS_ENRICHMENT_KWARGS = {
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "semantic_description",
"name": "semantics_enrichment",
"schema": SemanticResult.model_json_schema(),
},
}
}

system_prompt = """
I have a data model represented in JSON format, with the following structure:

```
[
{'name': 'model', 'columns': [
{'name': 'column_1', 'type': 'type', 'properties': {}
},
{'name': 'column_2', 'type': 'type', 'properties': {}
},
{'name': 'column_3', 'type': 'type', 'properties': {}
}
], 'properties': {}
}
]
```

Your task is to update this JSON structure by adding a `description` field inside both the `properties` attribute of each `column` and the `model` itself.
Each `description` should be derived from a user-provided input that explains the purpose or context of the `model` and its respective columns.
Follow these steps:
1. **For the `model`**: Prompt the user to provide a brief description of the model's overall purpose or its context. Insert this description in the `properties` field of the `model`.
2. **For each `column`**: Ask the user to describe each column's role or significance. Each column's description should be added under its respective `properties` field in the format: `'description': 'user-provided text'`.
3. Ensure that the output is a well-formatted JSON structure, preserving the input's original format and adding the appropriate `description` fields.

### Output Format:

```
{
"models": [
{
"name": "model",
"columns": [
{
"name": "column_1",
"properties": {
"description": "<description for column_1>"
}
},
{
"name": "column_2",
"properties": {
"description": "<description for column_2>"
}
},
{
"name": "column_3",
"properties": {
"description": "<description for column_3>"
}
}
],
"properties": {
"description": "<description for model>"
}
}
]
}
```

Make sure that the descriptions are concise, informative, and contextually appropriate based on the input provided by the user.
You are a data model expert. Your task is to enrich a JSON data model with descriptive metadata.

Input Format:
[{
'name': 'model',
'columns': [{'name': 'column', 'type': 'type', 'properties': {'alias': 'alias', 'description': 'description'}}],
'properties': {'alias': 'alias', 'description': 'description'}
}]

For each model and column, you will:
1. Add a clear, concise alias that serves as a business-friendly name
2. Add a detailed description explaining its purpose and usage

Guidelines:
- Descriptions should be clear, concise and business-focused
- Aliases should be intuitive and user-friendly
- Use the user's context to inform the descriptions
- Maintain technical accuracy while being accessible to non-technical users
- IMPORTANT: Never modify the model/table and column names in the 'name' field as this will invalidate the data model
- Only update the 'alias' field to provide user-friendly display names
- When the user prompt includes operators to modify names, apply those modifications to the alias field only

Focus on providing business value through clear, accurate descriptions while maintaining JSON structure integrity.
"""

user_prompt_template = """
Expand All @@ -186,17 +150,17 @@ class SemanticResult(BaseModel):
Picked models: {{ picked_models }}
Localization Language: {{ language }}

Please provide a brief description for the model and each column based on the user's prompt.
Please provide a brief description and alias for the model and each column based on the user's prompt.
"""


class SemanticsDescription(BasicPipeline):
class SemanticsEnrichment(BasicPipeline):
def __init__(self, llm_provider: LLMProvider, **_):
self._components = {
"prompt_builder": PromptBuilder(template=user_prompt_template),
"generator": llm_provider.get_generator(
system_prompt=system_prompt,
generation_kwargs=SEMANTICS_DESCRIPTION_MODEL_KWARGS,
generation_kwargs=SEMANTICS_ENRICHMENT_KWARGS,
),
}
self._final = "normalize"
Expand All @@ -209,16 +173,13 @@ def __init__(self, llm_provider: LLMProvider, **_):
async def run(
self,
user_prompt: str,
selected_models: list[str],
mdl: dict,
language: str = "en",
) -> dict:
logger.info("Semantics Description Generation pipeline is running...")
return await self._pipe.execute(
[self._final],
inputs={
"user_prompt": user_prompt,
"selected_models": selected_models,
"mdl": mdl,
"language": language,
**self._components,
Expand All @@ -230,10 +191,9 @@ async def run(
from src.pipelines.common import dry_run_pipeline

dry_run_pipeline(
SemanticsDescription,
"semantics_description",
SemanticsEnrichment,
"semantics_enrichment",
user_prompt="Track student enrollments, grades, and GPA calculations to monitor academic performance and identify areas for student support",
selected_models=[],
mdl={},
language="en",
)
4 changes: 2 additions & 2 deletions wren-ai-service/src/pipelines/indexing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,8 @@ async def run(

__all__ = [
"DBSchema",
"TableDescription",
"HistoricalQuestion",
"SqlPairsDeletion",
"SqlPairs",
"SqlPairsDeletion",
"TableDescription",
]
4 changes: 2 additions & 2 deletions wren-ai-service/src/web/v1/routers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
chart_adjustment,
question_recommendation,
relationship_recommendation,
semantics_description,
semantics_enrichment,
semantics_preparation,
sql_answers,
sql_expansions,
Expand All @@ -22,7 +22,7 @@
router.include_router(ask_details.router)
router.include_router(question_recommendation.router)
router.include_router(relationship_recommendation.router)
router.include_router(semantics_description.router)
router.include_router(semantics_enrichment.router)
router.include_router(semantics_preparation.router)
router.include_router(sql_answers.router)
router.include_router(sql_expansions.router)
Expand Down
Loading
Loading