updates

AnswerDotAI · Sep 2, 2024 · 4a64f6b · 4a64f6b
1 parent e9ab97e
commit 4a64f6b
Show file tree

Hide file tree

Showing 9 changed files with 725 additions and 172 deletions.
diff --git a/llms_txt/__init__.py b/llms_txt/__init__.py
@@ -1 +1,2 @@
 __version__ = "0.0.2"
+from .core import *
diff --git a/llms_txt/_modidx.py b/llms_txt/_modidx.py
@@ -7,11 +7,14 @@
                 'lib_path': 'llms_txt'},
   'syms': { 'llms_txt.core': { 'llms_txt.core.Doc': ('core.html#doc', 'llms_txt/core.py'),
                                'llms_txt.core.Section': ('core.html#section', 'llms_txt/core.py'),
-                               'llms_txt.core._opt_re': ('core.html#_opt_re', 'llms_txt/core.py'),
-                               'llms_txt.core._parse_llms_txt': ('core.html#_parse_llms_txt', 'llms_txt/core.py'),
-                               'llms_txt.core._parse_section': ('core.html#_parse_section', 'llms_txt/core.py'),
-                               'llms_txt.core._split_on_h2': ('core.html#_split_on_h2', 'llms_txt/core.py'),
+                               'llms_txt.core._parse_links': ('core.html#_parse_links', 'llms_txt/core.py'),
+                               'llms_txt.core._parse_llms': ('core.html#_parse_llms', 'llms_txt/core.py'),
+                               'llms_txt.core.create_ctx': ('core.html#create_ctx', 'llms_txt/core.py'),
                                'llms_txt.core.get_sizes': ('core.html#get_sizes', 'llms_txt/core.py'),
                                'llms_txt.core.llms_txt2ctx': ('core.html#llms_txt2ctx', 'llms_txt/core.py'),
                                'llms_txt.core.mk_ctx': ('core.html#mk_ctx', 'llms_txt/core.py'),
-                               'llms_txt.core.parse_llms_file': ('core.html#parse_llms_file', 'llms_txt/core.py')}}}
+                               'llms_txt.core.named_re': ('core.html#named_re', 'llms_txt/core.py'),
+                               'llms_txt.core.opt_re': ('core.html#opt_re', 'llms_txt/core.py'),
+                               'llms_txt.core.parse_link': ('core.html#parse_link', 'llms_txt/core.py'),
+                               'llms_txt.core.parse_llms_file': ('core.html#parse_llms_file', 'llms_txt/core.py'),
+                               'llms_txt.core.search': ('core.html#search', 'llms_txt/core.py')}}}
diff --git a/llms_txt/core.py b/llms_txt/core.py
@@ -3,7 +3,8 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_core.ipynb.
 
 # %% auto 0
-__all__ = ['Sections', 'Project', 'parse_llms_file', 'Doc', 'Section', 'mk_ctx', 'get_sizes', 'llms_txt2ctx']
+__all__ = ['Sections', 'Project', 'opt_re', 'named_re', 'search', 'parse_link', 'parse_llms_file', 'Doc', 'Section', 'mk_ctx',
+           'get_sizes', 'create_ctx', 'llms_txt2ctx']
 
 # %% ../nbs/01_core.ipynb
 import re
@@ -15,36 +16,51 @@
 import httpx
 
 # %% ../nbs/01_core.ipynb
-def _opt_re(s): return f'(?:{s})?'
+def opt_re(s):
+    "Pattern to optionally match `s`"
+    return f'(?:{s})?'
 
-def _parse_llms_txt(txt):
-    pat = r"^#\s*(?P<title>[^\n]+)\n+"
-    pat += _opt_re(r"^>\s*(?P<summary>.+?)\n+")
-    pat += r"(?P<rest>.*)"
-    match = re.search(pat, txt, flags=(re.DOTALL | re.MULTILINE))
-    return match.groupdict() if match else None
+def named_re(nm, pat):
+    "Pattern to match `pat` in a named capture group"
+    return f'(?P<{nm}>{pat})'
+
+def search(pat, txt, flags=0):
+    "Dictionary of matched groups in `pat` within `txt`"
+    return re.search(pat, txt, flags=flags).groupdict()
+
+# %% ../nbs/01_core.ipynb
+def parse_link(txt):
+    "Parse a link section from llms.txt"
+    title = named_re('title', r'[^\]]+')
+    url = named_re('url', r'[^\)]+')
+    desc = named_re('desc', r'.*')
+    desc_pat = opt_re(fr":\s*{desc}")
+    pat = fr'-\s*\[{title}\]\({url}\){desc_pat}'
+    return re.search(pat, txt).groupdict()
 
 # %% ../nbs/01_core.ipynb
-def _split_on_h2(text):
-    parts = re.split(r'\n?## ', text)
-    details = parts[0].strip() if parts[0].strip() else None
-    sections = [f"## {p.strip()}" for p in parts[1:] if p.strip()]
-    return details, sections
+def _parse_links(links):
+    return [parse_link(l) for l in re.split(r'\n+', links.strip()) if l.strip()]
 
 # %% ../nbs/01_core.ipynb
-def _parse_section(section):
-    title = section.split('\n', 1)[0].strip('# ')
-    links = re.findall(r'\[(.+?)\]\((.+?)\)(?:: (.+?))?(?=\n|$)', section)
-    return title, [(t, u, d.strip() if d else None) for t, u, d in links]
+def _parse_llms(txt):
+    start,*rest = re.split(fr'^##\s*(.*?$)', txt, flags=re.MULTILINE)
+    d = dict(chunked(rest, 2))
+    sects = {k: _parse_links(v) for k,v in d.items()}
+    return start.strip(),sects
 
 # %% ../nbs/01_core.ipynb
 def parse_llms_file(txt):
-    parsed = _parse_llms_txt(txt)
-    if not parsed: return None
-    parsed['details'], sections = _split_on_h2(parsed['rest'])
-    parsed['sections'] = dict(_parse_section(s) for s in sections)
-    del parsed['rest']
-    return dict2obj(parsed)
+    "Parse llms.txt file contents in `txt` to an `AttrDict`"
+    start,sects = _parse_llms(txt)
+    title = named_re('title', r'.+?$')
+    summ = named_re('summary', '.+?$')
+    summ_pat = opt_re(fr"^>\s*{summ}$")
+    info = named_re('info', '.*')
+    pat = fr'^#\s*{title}\n+{summ_pat}\n+{info}'
+    d = search(pat, start, (re.MULTILINE|re.DOTALL))
+    d['sections'] = sects
+    return dict2obj(d)
 
 # %% ../nbs/01_core.ipynb
 Sections = partial(ft, 'sections')
@@ -60,27 +76,32 @@ def Doc(url, **kw):
 # %% ../nbs/01_core.ipynb
 def Section(nm, items):
     "Create a `Section` FT object containing a `Doc` object for each child."
-    return ft(nm, *[Doc(title=title, url=url, detl=detl) for title,url,detl in items])
+    return ft(nm, *[Doc(**o) for o in items])
 
 # %% ../nbs/01_core.ipynb
 def mk_ctx(d, optional=True):
     "Create a `Project` with a `Section` for each H2 part in `d`, optionally skipping the 'optional' section."
     skip = '' if optional else 'Optional'
     sections = [Section(k, v) for k,v in d.sections.items() if k!=skip]
-    return Project(title=d.title, summary=d.summary, details=d.details)(*sections)
+    return Project(title=d.title, summary=d.summary)(d.info, *sections)
 
 # %% ../nbs/01_core.ipynb
 def get_sizes(ctx):
     "Get the size of each section of the LLM context"
-    return {o.tag:{p.title:len(p.children[0]) for p in o.children} for o in ctx.children}
+    return {o.tag:{p.title:len(p.children[0]) for p in o.children} for o in ctx.children if hasattr(o,'tag')}
+
+# %% ../nbs/01_core.ipynb
+def create_ctx(txt, optional=False):
+    "A `Project` with a `Section` for each H2 part in `txt`, optionally skipping the 'optional' section."
+    d = parse_llms_file(txt)
+    ctx = mk_ctx(d, optional=optional)
+    return to_xml(ctx, do_escape=False)
 
 # %% ../nbs/01_core.ipynb
 @call_parse
 def llms_txt2ctx(
     fname:str, # File name to read
-    optional:bool_arg=True # Skip 'optional' section?
+    optional:bool_arg=False # Include 'optional' section?
 ):
     "Print a `Project` with a `Section` for each H2 part in file read from `fname`, optionally skipping the 'optional' section."
-    d = parse_llms_file(Path(fname).read_text())
-    ctx = mk_ctx(d, optional=optional)
-    print(to_xml(ctx, do_escape=False))
+    print(create_ctx(Path(fname).read_text(), optional=optional))
diff --git a/nbs/00_intro.ipynb b/nbs/00_intro.ipynb
@@ -9,6 +9,16 @@
     "> Read llms.txt files and create XML context documents for LLMs"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "from fastcore.utils import *"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -20,6 +30,10 @@
     "\n",
     "> FastHTML is a python library which...\n",
     "\n",
+    "When writing FastHTML apps remember to:\n",
+    "\n",
+    "- Thing to remember\n",
+    "\n",
     "## Docs\n",
     "\n",
     "- [Surreal](https://host/README.md): Tiny jQuery alternative with Locality of Behavior\n",
@@ -62,6 +76,15 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "### CLI"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After installation, `llms_txt2ctx` is available in your terminal.\n",
+    "\n",
     "To get help for the CLI:\n",
     "\n",
     "```sh\n",
@@ -74,7 +97,167 @@
     "llms_txt2ctx llms.txt > llms.md\n",
     "```\n",
     "\n",
-    "Pass `--optional False` to skip the 'optional' section of the input file."
+    "Pass `--optional True` to add the 'optional' section of the input file."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Python module"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llms_txt import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "samp = Path('llms-sample.txt').read_text()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Use `parse_llms_file` to create a data structure with the sections of an llms.txt file (every section is parsed, including 'Optional'; filtering of the optional section happens later, in `create_ctx`):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['title', 'summary', 'info', 'sections']"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "parsed = parse_llms_file(samp)\n",
+    "list(parsed)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "('FastHTML',\n",
+       " 'FastHTML is a python library which brings together Starlette, Uvicorn, HTMX, and fastcore\\'s `FT` \"FastTags\" into a library for creating server-rendered hypermedia applications.')"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "parsed.title,parsed.summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['Docs', 'Examples', 'Optional']"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(parsed.sections)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/markdown": [
+       "```json\n",
+       "{ 'desc': 'A subset of the Starlette documentation useful for FastHTML '\n",
+       "          'development.',\n",
+       "  'title': 'Starlette full documentation',\n",
+       "  'url': 'https://gist.githubusercontent.com/jph00/809e4a4808d4510be0e3dc9565e9cbd3/raw/9b717589ca44cedc8aaf00b2b8cacef922964c0f/starlette-sml.md'}\n",
+       "```"
+      ],
+      "text/plain": [
+       "{'title': 'Starlette full documentation',\n",
+       " 'url': 'https://gist.githubusercontent.com/jph00/809e4a4808d4510be0e3dc9565e9cbd3/raw/9b717589ca44cedc8aaf00b2b8cacef922964c0f/starlette-sml.md',\n",
+       " 'desc': 'A subset of the Starlette documentation useful for FastHTML development.'}"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "parsed.sections.Optional[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Use `create_ctx` to create an LLM context file with XML sections, suitable for systems such as Claude (this is what the CLI calls behind the scenes). By default the 'Optional' section is left out; pass `optional=True` to include it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ctx = create_ctx(samp)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<project title=\"FastHTML\" summary='FastHTML is a python library which brings together Starlette, Uvicorn, HTMX, and fastcore&#39;s `FT` \"FastTags\" into a library for creating server-rendered hypermedia applications.'>\n",
+      "Remember:\n",
+      "\n",
+      "- Use `serve()` for running uvicorn (`if __name__ == \"__main__\"` is not\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(ctx[:300])"
    ]
   },
   {
@@ -87,37 +270,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "python3",
    "language": "python",
    "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.8"
-  },
-  "toc": {
-   "base_numbering": 1,
-   "nav_menu": {
-    "height": "411.818px",
-    "width": "301.818px"
-   },
-   "number_sections": true,
-   "sideBar": true,
-   "skip_h1_title": false,
-   "title_cell": "Table of Contents",
-   "title_sidebar": "Contents",
-   "toc_cell": false,
-   "toc_position": {},
-   "toc_section_display": true,
-   "toc_window_display": false
   }
  },
  "nbformat": 4,