containers · rhatdan · Dec 9, 2024 · sourcery-ai · Dec 3, 2024 · sourcery-ai
diff --git a/.github/workflows/ci-images.yml b/.github/workflows/ci-images.yml
@@ -15,7 +15,7 @@ on:
 
 jobs:
   build:
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-latest
     steps:
       - name: Checkout
         uses: actions/checkout@v4

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -8,7 +8,7 @@ on:
 
 jobs:
   bats:
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - name: install bats
@@ -17,15 +17,15 @@ jobs:
            sudo apt-get update
            sudo apt-get install podman bats bash codespell python3-argcomplete pipx
            make install-requirements
-           pip install tqdm --break-system-packages
+           pip install docling tqdm
 
       - name: run bats
         run: |
            make validate
            make bats
 
   bats-nocontainer:
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - name: install bats
@@ -37,11 +37,11 @@ jobs:
 
       - name: bats-nocontainer
         run: |
-           pip install tqdm --break-system-packages
+           pip install docling tqdm
            make bats-nocontainer
 
   docker:
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - name: install bats
@@ -73,11 +73,11 @@ jobs:
       - name: bats-docker
         run: |
            docker info
-           pip install tqdm --break-system-packages
+           pip install docling tqdm
            make bats-docker
 
   macos:
-    runs-on: macos-14
+    runs-on: macos-latest
     steps:
       - uses: actions/checkout@v4
       - name: install golang
@@ -90,8 +90,9 @@ jobs:
         run: |
            make install-requirements
            make validate
-           pipx install .
+           pipx install . docling
            make bats-nocontainer
+           cat /Users/runner/.local/pipx/logs/cmd_*.log
 
 # FIXME: ci script should be able to run on MAC.
 #      - name: Run ci

diff --git a/.github/workflows/latest.yml b/.github/workflows/latest.yml
@@ -6,7 +6,7 @@ on:
 
 jobs:
   linux:
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - name: Free Disk Space Linux
@@ -41,7 +41,7 @@ jobs:
         run: make test
 
   macos:
-    runs-on: macos-14
+    runs-on: macos-latest
     steps:
       - uses: actions/checkout@v4
       - name: install golang
@@ -52,7 +52,7 @@ jobs:
         run: make test
 
   build:
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-latest
     needs: [linux, macos]
     steps:
         - uses: actions/checkout@v4

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
@@ -6,7 +6,7 @@ on:
 
 jobs:
   linux:
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - name: Free Disk Space Linux
@@ -36,7 +36,7 @@ jobs:
         run: make test
 
   macos:
-    runs-on: macos-14
+    runs-on: macos-latest
     steps:
       - uses: actions/checkout@v4
       - name: install golang
@@ -47,7 +47,7 @@ jobs:
         run: make test
 
   build:
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-latest
     needs: [linux, macos]
     steps:
         - uses: actions/checkout@v4

diff --git a/docs/ramalama-info.1.md b/docs/ramalama-info.1.md
@@ -1,7 +1,7 @@
 % ramalama-info 1
 
 ## NAME
-ramalama\-info - Display RamaLama configuration information
+ramalama\-info - display RamaLama configuration information
 
 
 ## SYNOPSIS

diff --git a/docs/ramalama-rag.1.md b/docs/ramalama-rag.1.md
@@ -0,0 +1,35 @@
+% ramalama-rag 1
+
+## NAME
+ramalama\-rag - generate rag (Retrieval Augmented Generation) data from provided documents and convert into an OCI Image
+
+## SYNOPSIS
+**ramalama rag** [options] [path ...] image
+
+## DESCRIPTION
+Generate RAG (Retrieval Augmented Generation) data from the provided documents and package it into an OCI image.
+
+positional arguments:
+  path        Files/Directory containing PDF, DOCX, PPTX, XLSX, HTML, AsciiDoc & Markdown formatted files to be processed. Can be specified multiple times.
+  image       OCI Image name to contain processed rag data
+
+
+## OPTIONS
+
+#### **--help**, **-h**
+Print usage message
+
+## EXAMPLES
+
+```
+$ ramalama rag https://arxiv.org/pdf/2408.09869 /tmp/pdf quay.io/rhatdan/myrag
+Fetching 9 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 68509.50it/s]
+Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
+2024-12-04 13:49:07.372 (  70.927s) [        75AB6740]    doc_normalisation.h:448   WARN| found new `other` type: checkbox-unselected
+```
+
+## SEE ALSO
+**[ramalama(1)](ramalama.1.md)**
+
+## HISTORY
+Dec 2024, Originally compiled by Dan Walsh <dwalsh@redhat.com>
diff --git a/docs/ramalama.1.md b/docs/ramalama.1.md
@@ -130,12 +130,13 @@ The default can be overridden in the ramalama.conf file.
 | ------------------------------------------------- | ---------------------------------------------------------- |
 | [ramalama-containers(1)](ramalama-containers.1.md)| list all RamaLama containers                               |
 | [ramalama-convert(1)](ramalama-convert.1.md)      | convert AI Models from local storage to OCI Image          |
-| [ramalama-info(1)](ramalama-info.1.md)            | Display RamaLama configuration information                 |
+| [ramalama-info(1)](ramalama-info.1.md)            | display RamaLama configuration information                 |
 | [ramalama-list(1)](ramalama-list.1.md)            | list all downloaded AI Models                              |
 | [ramalama-login(1)](ramalama-login.1.md)          | login to remote registry                                   |
 | [ramalama-logout(1)](ramalama-logout.1.md)        | logout from remote registry                                |
 | [ramalama-pull(1)](ramalama-pull.1.md)            | pull AI Models from Model registries to local storage      |
 | [ramalama-push(1)](ramalama-push.1.md)            | push AI Models from local storage to remote registries     |
+| [ramalama-rag(1)](ramalama-rag.1.md)              | generate rag (Retrieval Augmented Generation) data from provided documents and convert into an OCI Image |
 | [ramalama-rm(1)](ramalama-rm.1.md)                | remove AI Models from local storage                        |
 | [ramalama-run(1)](ramalama-run.1.md)              | run specified AI Model as a chatbot                        |
 | [ramalama-serve(1)](ramalama-serve.1.md)          | serve REST API on specified AI Model                       |

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,6 +4,7 @@ version = "0.2.0"
 dependencies = [
   "argcomplete",
   "tqdm",
+  "docling",
 ]
 requires-python = ">= 3.8"
 maintainers = [

diff --git a/ramalama/cli.py b/ramalama/cli.py
@@ -6,6 +6,7 @@
 import subprocess
 import time
 import ramalama.oci
+import ramalama.rag
 
 from ramalama.huggingface import Huggingface
 from ramalama.common import (
@@ -14,6 +15,7 @@
     perror,
     run_cmd,
 )
+
 from ramalama.model import model_types
 from ramalama.oci import OCI
 from ramalama.ollama import Ollama
@@ -229,6 +231,7 @@ def configure_subcommands(parser):
     logout_parser(subparsers)
     pull_parser(subparsers)
     push_parser(subparsers)
+    rag_parser(subparsers)
     rm_parser(subparsers)
     run_parser(subparsers)
     serve_parser(subparsers)
@@ -408,7 +411,7 @@ def list_containers(args):
 
 
 def info_parser(subparsers):
-    parser = subparsers.add_parser("info", help="Display information pertaining to setup of RamaLama.")
+    parser = subparsers.add_parser("info", help="display information pertaining to setup of RamaLama.")
     parser.add_argument("--container", default=False, action="store_false", help=argparse.SUPPRESS)
     parser.set_defaults(func=info_cli)
 
@@ -771,6 +774,26 @@ def version_parser(subparsers):
     parser.set_defaults(func=print_version)
 
 
+def rag_parser(subparsers):
+    """Register the `rag` subcommand: zero or more PATH arguments, then one IMAGE; runs rag_cli."""
+    summary = "generate rag (Retrieval Augmented Generation) data from provided documents and convert into an OCI Image"
+    path_help = """\
+Files/Directory containing PDF, DOCX, PPTX, XLSX, HTML, AsciiDoc & Markdown
+formatted files to be processed"""
+    parser = subparsers.add_parser("rag", help=summary)
+    parser.add_argument("PATH", nargs="*", help=path_help)
+    parser.add_argument("IMAGE", help="OCI Image name to contain processed rag data")
+    parser.set_defaults(func=rag_cli)
+
+
+def rag_cli(args):
+    """Entry point for `ramalama rag`.
+
+    Hands the parsed arguments to ramalama.rag.generate unchanged.
+    """
+    ramalama.rag.generate(args)
+
+
 def rm_parser(subparsers):
     parser = subparsers.add_parser("rm", help="remove AI Model from local storage")
     parser.add_argument("--container", default=False, action="store_false", help=argparse.SUPPRESS)

diff --git a/ramalama/rag.py b/ramalama/rag.py
@@ -0,0 +1,110 @@
+import tempfile
+import os
+import json
+import logging
+from pathlib import Path
+from typing import Iterable
+
+from ramalama.common import run_cmd
+from docling.datamodel.base_models import ConversionStatus
+from docling.datamodel.document import ConversionResult
+from docling.document_converter import DocumentConverter
+
+_log = logging.getLogger(__name__)
+
+ociimage_rag = "org.containers.type=ai.image.rag"
+
+
+def walk(path):
+    """Return the paths of all files found under `path`, searched recursively.
+
+    Each returned path is the os.walk directory joined with the file name.
+    """
+    targets = []
+    for root, _dirs, files in os.walk(path, topdown=True):
+        candidates = (os.path.join(root, name) for name in files)
+        targets.extend(c for c in candidates if os.path.isfile(c))  # keep only what isfile() accepts
+    return targets
+
+
+def export_documents(
+    conv_results: Iterable[ConversionResult],
+    output_dir: Path,
+):
+    """Write each fully converted document to `output_dir` as `<stem>.json`.
+
+    Partial conversions and failures are only logged and counted, not exported.
+    Returns the tuple (success_count, partial_success_count, failure_count).
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+    success_count = partial_success_count = failure_count = 0
+
+    for result in conv_results:
+        source = result.input.file
+        if result.status == ConversionStatus.SUCCESS:
+            success_count += 1
+            # NOTE(review): the name is the input's stem only, so same-stem inputs share one JSON file.
+            with (output_dir / f"{source.stem}.json").open("w") as fp:
+                fp.write(json.dumps(result.document.export_to_dict()))
+        elif result.status == ConversionStatus.PARTIAL_SUCCESS:
+            _log.info(f"Document {source} was partially converted with the following errors:")
+            for item in result.errors:
+                _log.info(f"\t{item.error_message}")
+            partial_success_count += 1
+        else:
+            _log.info(f"Document {source} failed to convert.")
+            failure_count += 1
+
+    total = success_count + partial_success_count + failure_count
+    _log.info(
+        f"Processed {total} docs, of which {failure_count} failed "
+        f"and {partial_success_count} were partially converted."
+    )
+    return success_count, partial_success_count, failure_count
+
+
+def build(source, target, args):
+    """Build the OCI image `target` from directory `source` using `args.engine`.
+
+    The generated Containerfile is `FROM scratch`, COPYs the basename of `source` to `/`,
+    and adds the `ociimage_rag` label. Returns the stripped stdout of the `-q` build.
+    """
+    print(f"Building {target}...")
+    src = os.path.realpath(source)
+    contextdir = os.path.dirname(src)
+    model = os.path.basename(src)
+    # Write through the handle tempfile returns so `with` closes and removes it deterministically,
+    # rather than re-opening the path and leaving the first handle open until garbage collection.
+    with tempfile.NamedTemporaryFile(mode="w", prefix='RamaLama_Containerfile_', delete=True) as c:
+        c.write(
+            f"""\
+FROM scratch
+COPY {model} /
+LABEL {ociimage_rag}
+"""
+        )
+        c.flush()  # the engine opens the file by name, so the text must be on disk before it runs
+        cmd = [args.engine, "build", "-t", target, "--no-cache", "-q", "-f", c.name, contextdir]
+        imageid = run_cmd(cmd, debug=args.debug).stdout.decode("utf-8").strip()
+    return imageid
+
+
+def generate(args):
+    """Convert every target in args.PATH with docling and build args.IMAGE from the exported JSON.
+
+    Raises RuntimeError when at least one target fails to convert.
+    """
+    targets = []
+    for p in args.PATH:
+        if os.path.isdir(p):
+            targets.extend(walk(p))  # Walk directory and process all files
+        else:
+            targets.append(p)  # A single file, or something that is not a local path (WEB?)
+
+    # `delete=` is only accepted by TemporaryDirectory on Python >= 3.12; `with` cleans up on every version.
+    with tempfile.TemporaryDirectory(prefix="ramalama_") as tmpdir:
+        conv_results = DocumentConverter().convert_all(targets, raises_on_error=False)
+        _, _, failure_count = export_documents(conv_results, output_dir=Path(tmpdir))
+        if failure_count > 0:
+            raise RuntimeError(f"failed to convert {failure_count} target(s) out of {len(targets)} documents.")
+        build(tmpdir, args.IMAGE, args)