From 97556f31c550f57a86b1abe1f0fbc6ec9190ccf7 Mon Sep 17 00:00:00 2001 From: Hamel Husain Date: Wed, 11 Sep 2024 16:08:10 -0700 Subject: [PATCH] add files --- llms_txt/core.py | 3 ++- nbs/01_core.ipynb | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/llms_txt/core.py b/llms_txt/core.py index 4cc0358..845b4a1 100644 --- a/llms_txt/core.py +++ b/llms_txt/core.py @@ -70,7 +70,8 @@ def _doc(kw): "Create a `Doc` FT object with the text retrieved from `url` as the child, and `kw` as attrs." url = kw.pop('url') re_comment = re.compile('^<!--.*-->$', flags=re.MULTILINE) - txt = [o for o in httpx.get(url).text.splitlines() if not re_comment.search(o)] + re_base64_img = re.compile(r'<img[^>]*src="data:image/[^"]*"[^>]*>') + txt = [o for o in httpx.get(url).text.splitlines() if not re_comment.search(o) and not re_base64_img.search(o)] return Doc('\n'.join(txt), **kw) # %% ../nbs/01_core.ipynb diff --git a/nbs/01_core.ipynb b/nbs/01_core.ipynb index bc5b1e1..be56b40 100644 --- a/nbs/01_core.ipynb +++ b/nbs/01_core.ipynb @@ -684,7 +684,8 @@ " \"Create a `Doc` FT object with the text retrieved from `url` as the child, and `kw` as attrs.\"\n", " url = kw.pop('url')\n", " re_comment = re.compile('^<!--.*-->$', flags=re.MULTILINE)\n", - " txt = [o for o in httpx.get(url).text.splitlines() if not re_comment.search(o)]\n", + " re_base64_img = re.compile(r'<img[^>]*src=\"data:image/[^\"]*\"[^>]*>')\n", + " txt = [o for o in httpx.get(url).text.splitlines() if not re_comment.search(o) and not re_base64_img.search(o)]\n", " return Doc('\\n'.join(txt), **kw)" ] },