Skip to content

Commit

Permalink
updates
Browse files Browse the repository at this point in the history
  • Loading branch information
jph00 committed Sep 2, 2024
1 parent e9ab97e commit 4a64f6b
Show file tree
Hide file tree
Showing 9 changed files with 725 additions and 172 deletions.
1 change: 1 addition & 0 deletions llms_txt/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
__version__ = "0.0.2"
from .core import *
13 changes: 8 additions & 5 deletions llms_txt/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@
'lib_path': 'llms_txt'},
'syms': { 'llms_txt.core': { 'llms_txt.core.Doc': ('core.html#doc', 'llms_txt/core.py'),
'llms_txt.core.Section': ('core.html#section', 'llms_txt/core.py'),
'llms_txt.core._opt_re': ('core.html#_opt_re', 'llms_txt/core.py'),
'llms_txt.core._parse_llms_txt': ('core.html#_parse_llms_txt', 'llms_txt/core.py'),
'llms_txt.core._parse_section': ('core.html#_parse_section', 'llms_txt/core.py'),
'llms_txt.core._split_on_h2': ('core.html#_split_on_h2', 'llms_txt/core.py'),
'llms_txt.core._parse_links': ('core.html#_parse_links', 'llms_txt/core.py'),
'llms_txt.core._parse_llms': ('core.html#_parse_llms', 'llms_txt/core.py'),
'llms_txt.core.create_ctx': ('core.html#create_ctx', 'llms_txt/core.py'),
'llms_txt.core.get_sizes': ('core.html#get_sizes', 'llms_txt/core.py'),
'llms_txt.core.llms_txt2ctx': ('core.html#llms_txt2ctx', 'llms_txt/core.py'),
'llms_txt.core.mk_ctx': ('core.html#mk_ctx', 'llms_txt/core.py'),
'llms_txt.core.parse_llms_file': ('core.html#parse_llms_file', 'llms_txt/core.py')}}}
'llms_txt.core.named_re': ('core.html#named_re', 'llms_txt/core.py'),
'llms_txt.core.opt_re': ('core.html#opt_re', 'llms_txt/core.py'),
'llms_txt.core.parse_link': ('core.html#parse_link', 'llms_txt/core.py'),
'llms_txt.core.parse_llms_file': ('core.html#parse_llms_file', 'llms_txt/core.py'),
'llms_txt.core.search': ('core.html#search', 'llms_txt/core.py')}}}
81 changes: 51 additions & 30 deletions llms_txt/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_core.ipynb.

# %% auto 0
__all__ = ['Sections', 'Project', 'parse_llms_file', 'Doc', 'Section', 'mk_ctx', 'get_sizes', 'llms_txt2ctx']
__all__ = ['Sections', 'Project', 'opt_re', 'named_re', 'search', 'parse_link', 'parse_llms_file', 'Doc', 'Section', 'mk_ctx',
'get_sizes', 'create_ctx', 'llms_txt2ctx']

# %% ../nbs/01_core.ipynb
import re
Expand All @@ -15,36 +16,51 @@
import httpx

# %% ../nbs/01_core.ipynb
def _opt_re(s): return f'(?:{s})?'
def opt_re(s):
    "Pattern to optionally match `s` (wraps it in a non-capturing group followed by `?`)"
    return f'(?:{s})?'

def _parse_llms_txt(txt):
    "Dict of `title`, `summary` and `rest` matched in `txt`, or `None` if the pattern does not match"
    # H1 title line followed by at least one newline.
    pat = r"^#\s*(?P<title>[^\n]+)\n+"
    # Optional blockquote summary; the `summary` key is `None` when the group does not take part.
    pat += _opt_re(r"^>\s*(?P<summary>.+?)\n+")
    # Everything left over; DOTALL lets `.` run across newlines.
    pat += r"(?P<rest>.*)"
    match = re.search(pat, txt, flags=(re.DOTALL | re.MULTILINE))
    return match.groupdict() if match else None
def named_re(nm, pat):
    "Pattern to match `pat` in a named capture group called `nm`"
    return f'(?P<{nm}>{pat})'

def search(pat, txt, flags=0):
    "Dictionary of matched groups in `pat` within `txt`, or `None` if `pat` does not match"
    res = re.search(pat, txt, flags=flags)
    # `re.search` returns `None` on no match; guard so callers get `None` instead of an AttributeError.
    return res.groupdict() if res else None

# %% ../nbs/01_core.ipynb
def parse_link(txt):
    "Parse a link line from llms.txt in `txt` into a dict with `title`, `url` and `desc`"
    title = named_re('title', r'[^\]]+')
    url = named_re('url', r'[^\)]+')
    desc = named_re('desc', r'.*')
    # The description (after a colon) is optional; `desc` is `None` when it is absent.
    desc_pat = opt_re(fr":\s*{desc}")
    pat = fr'-\s*\[{title}\]\({url}\){desc_pat}'
    # NOTE: raises AttributeError if `txt` contains no `- [title](url)` item, since `re.search` returns `None`.
    return re.search(pat, txt).groupdict()

# %% ../nbs/01_core.ipynb
def _split_on_h2(text):
parts = re.split(r'\n?## ', text)
details = parts[0].strip() if parts[0].strip() else None
sections = [f"## {p.strip()}" for p in parts[1:] if p.strip()]
return details, sections
def _parse_links(links):
    "List of `parse_link` results, one for each non-blank line in `links`"
    return [parse_link(line) for line in re.split(r'\n+', links.strip()) if line.strip()]

# %% ../nbs/01_core.ipynb
def _parse_section(section):
title = section.split('\n', 1)[0].strip('# ')
links = re.findall(r'\[(.+?)\]\((.+?)\)(?:: (.+?))?(?=\n|$)', section)
return title, [(t, u, d.strip() if d else None) for t, u, d in links]
def _parse_llms(txt):
    "Split `txt` at its H2 headings; returns the stripped leading text and a dict of heading -> parsed links"
    # The capture group makes `re.split` return heading names interleaved with section bodies.
    # `(?!#)` keeps `###` (and deeper) headings from being mistaken for H2 section starts.
    start,*rest = re.split(r'^##(?!#)\s*(.*?$)', txt, flags=re.MULTILINE)
    # `rest` alternates heading, body; `chunked(rest, 2)` presumably groups them into pairs.
    d = dict(chunked(rest, 2))
    sects = {k: _parse_links(v) for k,v in d.items()}
    return start.strip(),sects

# %% ../nbs/01_core.ipynb
def parse_llms_file(txt):
    "Parse llms.txt file contents in `txt` to an `AttrDict`"
    # `start` is the text before the first H2; `sects` maps each H2 heading to its parsed links.
    start,sects = _parse_llms(txt)
    title = named_re('title', r'.+?$')
    summ = named_re('summary', '.+?$')
    summ_pat = opt_re(fr"^>\s*{summ}$")
    info = named_re('info', '.*')
    # `\n*` rather than `\n+`: a file with only a title, or a summary with nothing after it, still parses,
    # and the lazy title can no longer grow across lines to satisfy a mandatory blank line.
    pat = fr'^#\s*{title}\n*{summ_pat}\n*{info}'
    d = search(pat, start, (re.MULTILINE|re.DOTALL))
    d['sections'] = sects
    return dict2obj(d)

# %% ../nbs/01_core.ipynb
Sections = partial(ft, 'sections')
Expand All @@ -60,27 +76,32 @@ def Doc(url, **kw):
# %% ../nbs/01_core.ipynb
def Section(nm, items):
    "Create a `Section` FT object containing a `Doc` object for each child."
    # Each item is a dict (such as the `title`/`url`/`desc` dict from `parse_link`) expanded into `Doc` keyword args.
    return ft(nm, *[Doc(**o) for o in items])

# %% ../nbs/01_core.ipynb
def mk_ctx(d, optional=True):
    "Create a `Project` with a `Section` for each H2 part in `d`, optionally skipping the 'optional' section."
    # With `optional` truthy the skip key is '', which is not expected to name any section, so nothing is dropped.
    skip = '' if optional else 'Optional'
    sections = [Section(k, v) for k,v in d.sections.items() if k!=skip]
    # The free-text `info` is passed as the first child, ahead of the sections.
    return Project(title=d.title, summary=d.summary)(d.info, *sections)

# %% ../nbs/01_core.ipynb
def get_sizes(ctx):
    "Get the size of each section of the LLM context"
    # Children without a `tag` (e.g. the plain-text info that `mk_ctx` puts before the sections) are skipped.
    return {o.tag:{p.title:len(p.children[0]) for p in o.children} for o in ctx.children if hasattr(o,'tag')}

# %% ../nbs/01_core.ipynb
def create_ctx(txt, optional=False):
    "XML for a `Project` with a `Section` for each H2 part in `txt`, optionally skipping the 'optional' section."
    d = parse_llms_file(txt)
    ctx = mk_ctx(d, optional=optional)
    # `do_escape=False` is forwarded to `to_xml`; presumably it leaves the text unescaped.
    return to_xml(ctx, do_escape=False)

# %% ../nbs/01_core.ipynb
@call_parse
def llms_txt2ctx(
    fname:str, # File name to read
    optional:bool_arg=False # Include 'optional' section?
):
    "Print a `Project` with a `Section` for each H2 part in file read from `fname`, optionally skipping the 'optional' section."
    # Delegate to `create_ctx` so the CLI and the Python API build the context the same way.
    print(create_ctx(Path(fname).read_text(), optional=optional))
215 changes: 185 additions & 30 deletions nbs/00_intro.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@
"> Read llms.txt files and create XML context documents for LLMs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"from fastcore.utils import *"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -20,6 +30,10 @@
"\n",
"> FastHTML is a python library which...\n",
"\n",
"When writing FastHTML apps remember to:\n",
"\n",
"- Thing to remember\n",
"\n",
"## Docs\n",
"\n",
"- [Surreal](https://host/README.md): Tiny jQuery alternative with Locality of Behavior\n",
Expand Down Expand Up @@ -62,6 +76,15 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### CLI"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"After installation, `llms_txt2ctx` is available in your terminal.\n",
"\n",
"To get help for the CLI:\n",
"\n",
"```sh\n",
Expand All @@ -74,7 +97,167 @@
"llms_txt2ctx llms.txt > llms.md\n",
"```\n",
"\n",
"Pass `--optional False` to skip the 'optional' section of the input file."
"Pass `--optional True` to add the 'optional' section of the input file."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Python module"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llms_txt import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"samp = Path('llms-sample.txt').read_text()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use `parse_llms_file` to create a data structure with the sections of an llms.txt file:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['title', 'summary', 'info', 'sections']"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"parsed = parse_llms_file(samp)\n",
"list(parsed)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('FastHTML',\n",
" 'FastHTML is a python library which brings together Starlette, Uvicorn, HTMX, and fastcore\\'s `FT` \"FastTags\" into a library for creating server-rendered hypermedia applications.')"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"parsed.title,parsed.summary"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Docs', 'Examples', 'Optional']"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(parsed.sections)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"```json\n",
"{ 'desc': 'A subset of the Starlette documentation useful for FastHTML '\n",
" 'development.',\n",
" 'title': 'Starlette full documentation',\n",
" 'url': 'https://gist.githubusercontent.com/jph00/809e4a4808d4510be0e3dc9565e9cbd3/raw/9b717589ca44cedc8aaf00b2b8cacef922964c0f/starlette-sml.md'}\n",
"```"
],
"text/plain": [
"{'title': 'Starlette full documentation',\n",
" 'url': 'https://gist.githubusercontent.com/jph00/809e4a4808d4510be0e3dc9565e9cbd3/raw/9b717589ca44cedc8aaf00b2b8cacef922964c0f/starlette-sml.md',\n",
" 'desc': 'A subset of the Starlette documentation useful for FastHTML development.'}"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"parsed.sections.Optional[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use `create_ctx` to create an LLM context file with XML sections, suitable for systems such as Claude (this is what the CLI calls behind the scenes). Pass `optional=True` to also include the 'Optional' section."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ctx = create_ctx(samp)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<project title=\"FastHTML\" summary='FastHTML is a python library which brings together Starlette, Uvicorn, HTMX, and fastcore&#39;s `FT` \"FastTags\" into a library for creating server-rendered hypermedia applications.'>\n",
"Remember:\n",
"\n",
"- Use `serve()` for running uvicorn (`if __name__ == \"__main__\"` is not\n"
]
}
],
"source": [
"print(ctx[:300])"
]
},
{
Expand All @@ -87,37 +270,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "python3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
},
"toc": {
"base_numbering": 1,
"nav_menu": {
"height": "411.818px",
"width": "301.818px"
},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
Expand Down
Loading

0 comments on commit 4a64f6b

Please sign in to comment.