10 changes: 10 additions & 0 deletions docs/docs/providers/file_processor/index.mdx
@@ -0,0 +1,10 @@
---
sidebar_label: File Processor
title: File_Processor
---

# File_Processor

## Overview

This section contains documentation for all available providers for the **file_processor** API.
17 changes: 17 additions & 0 deletions docs/docs/providers/file_processor/inline_reference.mdx
@@ -0,0 +1,17 @@
---
description: "Reference file processor implementation (placeholder for development)"
sidebar_label: Reference
title: inline::reference
---

# inline::reference

## Description

Reference file processor implementation (placeholder for development)

## Sample Configuration

```yaml
{}
```
1 change: 1 addition & 0 deletions src/llama_stack/apis/datatypes.py
@@ -127,6 +127,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
files = "files"
prompts = "prompts"
conversations = "conversations"
file_processor = "file_processor"

Contributor: I wonder if this should be plural, like `file_processors`, to match the APIs above it? This is kind of a nit, but just something to think about!

# built-in API
inspect = "inspect"
7 changes: 7 additions & 0 deletions src/llama_stack/apis/file_processor/__init__.py
@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .file_processor import *
96 changes: 96 additions & 0 deletions src/llama_stack/apis/file_processor/file_processor.py
@@ -0,0 +1,96 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Protocol, runtime_checkable

from pydantic import BaseModel

from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.vector_io.vector_io import Chunk, VectorStoreChunkingStrategy
from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
from llama_stack.schema_utils import json_schema_type, webmethod


@json_schema_type
class ProcessFileRequest(BaseModel):
"""Request for processing a file into structured content."""

file_data: bytes
"""Raw file data to process."""

filename: str
"""Original filename for format detection and processing hints."""

options: dict[str, Any] | None = None
"""Optional processing options. Provider-specific parameters."""

chunking_strategy: VectorStoreChunkingStrategy | None = None
"""Optional chunking strategy for splitting content into chunks."""

include_embeddings: bool = False
"""Whether to generate embeddings for chunks."""


@json_schema_type
class ProcessedContent(BaseModel):
"""Result of file processing operation."""

content: str
"""Extracted text content from the file."""

chunks: list[Chunk] | None = None
"""Optional chunks if chunking strategy was provided."""

embeddings: list[list[float]] | None = None
"""Optional embeddings for chunks if requested."""

metadata: dict[str, Any]
"""Processing metadata including processor name, timing, and provider-specific data."""


@telemetry_traceable
@runtime_checkable
class FileProcessor(Protocol):
"""
File Processor API for converting files into structured, processable content.

This API provides a flexible interface for processing various file formats
(PDFs, documents, images, etc.) into text content that can be used for
vector store ingestion, RAG applications, or standalone content extraction.

The API supports:
- Multiple file formats through extensible provider architecture
- Configurable processing options per provider
- Integration with vector store chunking strategies
- Optional embedding generation for chunks
- Rich metadata about processing results

Future providers can extend this interface to support additional formats,
processing capabilities, and optimization strategies.
"""

@webmethod(route="/file-processor/process", method="POST", level=LLAMA_STACK_API_V1ALPHA)
async def process_file(
self,
file_data: bytes,
filename: str,
options: dict[str, Any] | None = None,
chunking_strategy: VectorStoreChunkingStrategy | None = None,
include_embeddings: bool = False,
) -> ProcessedContent:
"""
Process a file into structured content with optional chunking and embeddings.

This method processes raw file data and converts it into text content for applications such as vector store ingestion.

:param file_data: Raw bytes of the file to process.
:param filename: Original filename for format detection.
:param options: Provider-specific processing options (e.g., OCR settings, output format).
:param chunking_strategy: Optional strategy for splitting content into chunks.
:param include_embeddings: Whether to generate embeddings for chunks.
:returns: ProcessedContent with extracted text, optional chunks, and metadata.
"""
...
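
For orientation, here is a minimal, hedged sketch of how a caller might use the new API once a `FileProcessor` implementation has been resolved. The `processor` handle, the input file, and the `options` values are hypothetical; only the `process_file` signature and the `ProcessedContent` fields come from the protocol above.

```python
# Sketch only: assumes a resolved FileProcessor implementation is available as
# `processor`; the input file and the options dict are illustrative, not real.
from pathlib import Path


async def extract_text(processor) -> None:
    data = Path("report.pdf").read_bytes()  # hypothetical input file

    # process_file returns ProcessedContent: extracted text, optional chunks
    # and embeddings, plus provider metadata (see the protocol above).
    result = await processor.process_file(
        file_data=data,
        filename="report.pdf",
        options={"ocr": True},   # provider-specific; illustrative only
        chunking_strategy=None,  # or a VectorStoreChunkingStrategy for chunked output
        include_embeddings=False,
    )

    print(result.content)
    print(result.metadata)
```

With the reference provider wired in (see the run.yaml changes below), this call would currently return placeholder content only, which is part of what the review discussion later in this PR is about.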
2 changes: 2 additions & 0 deletions src/llama_stack/core/resolver.py
@@ -16,6 +16,7 @@
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.datatypes import ExternalApiSpec
from llama_stack.apis.eval import Eval
from llama_stack.apis.file_processor import FileProcessor
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference, InferenceProvider
from llama_stack.apis.inspect import Inspect
@@ -96,6 +97,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
Api.files: Files,
Api.prompts: Prompts,
Api.conversations: Conversations,
Api.file_processor: FileProcessor,
}

if external_apis:
2 changes: 2 additions & 0 deletions src/llama_stack/distributions/ci-tests/build.yaml
@@ -29,6 +29,8 @@ distribution_spec:
- provider_type: remote::weaviate
files:
- provider_type: inline::localfs
file_processor:
- provider_type: inline::reference
safety:
- provider_type: inline::llama-guard
- provider_type: inline::code-scanner
4 changes: 4 additions & 0 deletions src/llama_stack/distributions/ci-tests/run.yaml
@@ -5,6 +5,7 @@ apis:
- batches
- datasetio
- eval
- file_processor
- files
- inference
- post_training
@@ -154,6 +155,9 @@ providers:
metadata_store:
table_name: files_metadata
backend: sql_default
file_processor:
- provider_id: reference
provider_type: inline::reference
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
2 changes: 2 additions & 0 deletions src/llama_stack/distributions/starter-gpu/build.yaml
@@ -30,6 +30,8 @@ distribution_spec:
- provider_type: remote::weaviate
files:
- provider_type: inline::localfs
file_processor:
- provider_type: inline::reference
safety:
- provider_type: inline::llama-guard
- provider_type: inline::code-scanner
@@ -5,6 +5,7 @@ apis:
- batches
- datasetio
- eval
- file_processor
- files
- inference
- post_training
@@ -154,6 +155,9 @@ providers:
metadata_store:
table_name: files_metadata
backend: sql_default
file_processor:
- provider_id: reference
provider_type: inline::reference
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
4 changes: 4 additions & 0 deletions src/llama_stack/distributions/starter-gpu/run.yaml
@@ -5,6 +5,7 @@ apis:
- batches
- datasetio
- eval
- file_processor
- files
- inference
- post_training
@@ -154,6 +155,9 @@ providers:
metadata_store:
table_name: files_metadata
backend: sql_default
file_processor:
- provider_id: reference
provider_type: inline::reference
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
2 changes: 2 additions & 0 deletions src/llama_stack/distributions/starter/build.yaml
@@ -30,6 +30,8 @@ distribution_spec:
- provider_type: remote::weaviate
files:
- provider_type: inline::localfs
file_processor:

Contributor: Should we have this API in starter? Or should we exclude it until it graduates out of alpha / has more providers?

I know post_training is in here, but we had similar issues with that API being in starter due to its startup process/heavy dependencies (torch). I feel like this API may be similar in that way. What do you think?

- provider_type: inline::reference
safety:
- provider_type: inline::llama-guard
- provider_type: inline::code-scanner
@@ -5,6 +5,7 @@ apis:
- batches
- datasetio
- eval
- file_processor
- files
- inference
- post_training
@@ -154,6 +155,9 @@ providers:
metadata_store:
table_name: files_metadata
backend: sql_default
file_processor:
- provider_id: reference
provider_type: inline::reference
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
4 changes: 4 additions & 0 deletions src/llama_stack/distributions/starter/run.yaml
@@ -5,6 +5,7 @@ apis:
- batches
- datasetio
- eval
- file_processor
- files
- inference
- post_training
@@ -154,6 +155,9 @@ providers:
metadata_store:
table_name: files_metadata
backend: sql_default
file_processor:
- provider_id: reference
provider_type: inline::reference
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
1 change: 1 addition & 0 deletions src/llama_stack/distributions/starter/starter.py
@@ -128,6 +128,7 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
BuildProvider(provider_type="remote::weaviate"),
],
"files": [BuildProvider(provider_type="inline::localfs")],
"file_processor": [BuildProvider(provider_type="inline::reference")],
"safety": [
BuildProvider(provider_type="inline::llama-guard"),
BuildProvider(provider_type="inline::code-scanner"),
1 change: 1 addition & 0 deletions src/llama_stack/log.py
@@ -45,6 +45,7 @@ class LoggingConfig(BaseModel):
"providers",
"models",
"files",
"file_processor",
"vector_io",
"tool_runtime",
"cli",
5 changes: 5 additions & 0 deletions src/llama_stack/providers/inline/file_processor/__init__.py
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
@@ -0,0 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .config import ReferenceFileProcessorImplConfig


async def get_provider_impl(config: ReferenceFileProcessorImplConfig, deps):
from .reference import ReferenceFileProcessorImpl

impl = ReferenceFileProcessorImpl(config, deps)
await impl.initialize()
return impl
@@ -0,0 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pydantic import BaseModel


class ReferenceFileProcessorImplConfig(BaseModel):
"""Configuration for the reference file processor implementation."""

@staticmethod
def sample_run_config(**kwargs):
return {}
@@ -0,0 +1,42 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any

from llama_stack.apis.file_processor import FileProcessor, ProcessedContent
from llama_stack.apis.vector_io import VectorStoreChunkingStrategy

from .config import ReferenceFileProcessorImplConfig


class ReferenceFileProcessorImpl(FileProcessor):
"""Reference implementation of the FileProcessor API."""

def __init__(self, config: ReferenceFileProcessorImplConfig, deps: dict[str, Any]):
self.config = config
self.deps = deps

async def initialize(self) -> None:
pass

async def process_file(

Contributor: Do we need a reference provider if that provider is a no-op? Instead, should we do with this what we did with SDG, where it is just a stub until an actual provider implementation is added? Otherwise this is dead code that someone could put in their run.yaml and get no output from.

Collaborator: +1 on this. Let's first propose the new API, then add an implementation in another PR. Thanks!

self,
file_data: bytes,
filename: str,
options: dict[str, Any] | None = None,
chunking_strategy: VectorStoreChunkingStrategy | None = None,
include_embeddings: bool = False,
) -> ProcessedContent:
"""Process a file into structured content."""
return ProcessedContent(
content="Placeholder content",
chunks=None,
embeddings=None,
metadata={
"processor": "reference",
"filename": filename,
},
)
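
As a point of comparison for the review thread above: a stub-style provider, the alternative the reviewers suggest (similar to what was done with SDG), might look roughly like the sketch below. This is illustrative only and is not part of the PR.

```python
# Hypothetical stub variant discussed in review: fail loudly instead of
# returning placeholder output, until a real provider implementation lands.
from typing import Any

from llama_stack.apis.file_processor import FileProcessor, ProcessedContent
from llama_stack.apis.vector_io import VectorStoreChunkingStrategy


class StubFileProcessorImpl(FileProcessor):
    async def initialize(self) -> None:
        pass

    async def process_file(
        self,
        file_data: bytes,
        filename: str,
        options: dict[str, Any] | None = None,
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
        include_embeddings: bool = False,
    ) -> ProcessedContent:
        raise NotImplementedError("file_processor has no concrete provider implementation yet")
```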
20 changes: 20 additions & 0 deletions src/llama_stack/providers/registry/file_processor.py
@@ -0,0 +1,20 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec


def available_providers() -> list[ProviderSpec]:
return [
InlineProviderSpec(
api=Api.file_processor,
provider_type="inline::reference",
pip_packages=[],
module="llama_stack.providers.inline.file_processor.reference",
config_class="llama_stack.providers.inline.file_processor.reference.config.ReferenceFileProcessorImplConfig",
description="Reference file processor implementation (placeholder for development)",
),
]
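
For context, a registry entry like the one above is consumed roughly as follows: the stack imports the named `module`, builds the `config_class`, and passes both to the module's `get_provider_impl` (shown earlier in this PR). The sketch below is an approximation of that wiring, not the actual resolver code.

```python
# Rough approximation of inline-provider wiring; the real logic lives in
# llama_stack.core.resolver and is more involved than this sketch.
import importlib
from typing import Any

from llama_stack.providers.inline.file_processor.reference.config import (
    ReferenceFileProcessorImplConfig,
)


async def build_reference_file_processor(deps: dict[str, Any]):
    module = importlib.import_module("llama_stack.providers.inline.file_processor.reference")
    config = ReferenceFileProcessorImplConfig()  # empty config, matching the sample `{}`
    return await module.get_provider_impl(config, deps)
```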