From 7624a51062c36c59e6119b7282d41c613c6f9852 Mon Sep 17 00:00:00 2001 From: John Date: Thu, 9 Jan 2025 09:09:51 -0800 Subject: [PATCH] push --- .github/workflows/deploy-docs.yml | 63 +++++++++ .gitignore | 1 + txt2dataset/docs/Makefile | 23 ++++ txt2dataset/docs/make.bat | 35 +++++ txt2dataset/docs/source/dataset_builder.rst | 130 ++++++++++++++++++ txt2dataset/docs/source/index.rst | 14 ++ .../__pycache__/__init__.cpython-311.pyc | Bin 0 -> 256 bytes .../dataset_builder.cpython-311.pyc | Bin 0 -> 6556 bytes 8 files changed, 266 insertions(+) create mode 100644 .github/workflows/deploy-docs.yml create mode 100644 .gitignore create mode 100644 txt2dataset/docs/Makefile create mode 100644 txt2dataset/docs/make.bat create mode 100644 txt2dataset/docs/source/dataset_builder.rst create mode 100644 txt2dataset/docs/source/index.rst create mode 100644 txt2dataset/txt2dataset/__pycache__/__init__.cpython-311.pyc create mode 100644 txt2dataset/txt2dataset/__pycache__/dataset_builder.cpython-311.pyc diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml new file mode 100644 index 0000000..feac949 --- /dev/null +++ b/.github/workflows/deploy-docs.yml @@ -0,0 +1,63 @@ +name: Deploy Sphinx Documentation + +on: + push: + branches: + - main + workflow_dispatch: + +permissions: + contents: write + +jobs: + build-and-deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install sphinx sphinx_rtd_theme myst-parser + # Add any additional dependencies your docs need + # pip install -r txt2dataset/docs/requirements.txt + + - name: Clean and Build Documentation + run: | + cd txt2dataset/docs + # More aggressive cleaning + rm -rf build/ + rm -rf source/_build/ + rm -rf _build/ + git rm -rf --cached build/ || true + git rm -rf --cached _build/ || true + make clean + make 
html + ls -la + ls -la build/ || true + + - name: Check Build Directory + run: | + pwd + ls -la txt2dataset/docs/build/html || true + + - name: Deploy to GitHub Pages + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./txt2dataset/docs/build/html # Must match the directory produced by the build step above + force_orphan: true # This ensures a fresh history + enable_jekyll: false + user_name: 'github-actions[bot]' + user_email: 'github-actions[bot]@users.noreply.github.com' + commit_message: 'Deploy Sphinx documentation [skip ci]' + full_commit_message: | + Deploy Sphinx documentation + + Build from ${{ github.sha }} + Triggered by ${{ github.event_name }} \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a66c47a --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.egg-info \ No newline at end of file diff --git a/txt2dataset/docs/Makefile b/txt2dataset/docs/Makefile new file mode 100644 index 0000000..e72248f --- /dev/null +++ b/txt2dataset/docs/Makefile @@ -0,0 +1,23 @@ +# Minimal makefile for Sphinx documentation +# +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile clean + +# Add clean target +clean: + rm -rf $(BUILDDIR)/* + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/txt2dataset/docs/make.bat b/txt2dataset/docs/make.bat new file mode 100644 index 0000000..dc1312a --- /dev/null +++ b/txt2dataset/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/txt2dataset/docs/source/dataset_builder.rst b/txt2dataset/docs/source/dataset_builder.rst new file mode 100644 index 0000000..eb05fb6 --- /dev/null +++ b/txt2dataset/docs/source/dataset_builder.rst @@ -0,0 +1,130 @@ + +Dataset Builder +=============== + +Transforms unstructured text data into structured datasets using the Gemini API. You can get a free API Key from `Google AI Studio <https://aistudio.google.com/app/apikey>`_ with a 15 requests-per-minute (rpm) limit. For higher rate limits, you can then set up the Google $300 Free Credit Trial for 90 days. + +Requirements +------------ + +Input CSV must contain ``accession_number`` and ``text`` columns. + +Methods +------- + +set_api_key(api_key) + Sets Google Gemini API key for authentication. + +set_paths(input_path, output_path, failed_path) + Sets input CSV path, output path, and failed records log path. + +set_base_prompt(prompt) + Sets prompt template for Gemini API. 
+ +set_response_schema(schema) + Sets expected JSON schema for validation. + +set_model(model_name) + Sets Gemini model (default: 'gemini-1.5-flash-8b'). + +set_rpm(rpm) + Sets API rate limit in requests per minute (default: 1500). + +set_save_frequency(frequency) + Sets save interval in records (default: 100). + +build() + Processes input CSV and generates dataset. + +Usage +----- + +.. code-block:: python + + from txt2dataset import DatasetBuilder + import os + + builder = DatasetBuilder() + + # Set API key + builder.set_api_key(os.environ["GOOGLE_API_KEY"]) + + # Set required configurations + builder.set_paths( + input_path="data/item502.csv", + output_path="data/bod.csv", + failed_path="data/failed_accessions.txt" + ) + + builder.set_base_prompt("""Extract Director or Principal Officer info to JSON format. + Provide the following information: + - start_date (YYYYMMDD) + - end_date (YYYYMMDD) + - name (First Middle Last) + - title + Return null if info unavailable.""") + + builder.set_response_schema({ + "type": "ARRAY", + "items": { + "type": "OBJECT", + "properties": { + "start_date": {"type": "STRING", "description": "Start date in YYYYMMDD format"}, + "end_date": {"type": "STRING", "description": "End date in YYYYMMDD format"}, + "name": {"type": "STRING", "description": "Full name (First Middle Last)"}, + "title": {"type": "STRING", "description": "Official title/position"} + }, + "required": ["start_date", "end_date", "name", "title"] + } + }) + + # Optional configurations + builder.set_rpm(1500) + builder.set_save_frequency(100) + builder.set_model('gemini-1.5-flash-8b') + + # Build the dataset + builder.build() + +API Key Setup +------------- + +1. Get API Key: + Visit `Google AI Studio <https://aistudio.google.com/app/apikey>`_ to generate your API key. + +2. 
Set API Key as Environment Variable: + + Windows (Command Prompt): + :: + + setx GOOGLE_API_KEY your-api-key + + Windows (PowerShell): + :: + + [System.Environment]::SetEnvironmentVariable('GOOGLE_API_KEY', 'your-api-key', 'User') + + macOS/Linux (bash): + :: + + echo 'export GOOGLE_API_KEY="your-api-key"' >> ~/.bash_profile + source ~/.bash_profile + + macOS (zsh): + :: + + echo 'export GOOGLE_API_KEY="your-api-key"' >> ~/.zshrc + source ~/.zshrc + + Note: Replace 'your-api-key' with your actual API key. + + +Alternative API Key Setup +------------------------- + +You can also set the API key directly in your Python code, though this is not recommended for production: + +.. code-block:: python + + api_key = "your-api-key" # Replace with your actual API key + builder.set_api_key(api_key) \ No newline at end of file diff --git a/txt2dataset/docs/source/index.rst b/txt2dataset/docs/source/index.rst new file mode 100644 index 0000000..41983f8 --- /dev/null +++ b/txt2dataset/docs/source/index.rst @@ -0,0 +1,14 @@ +======================================= +Welcome to txt2dataset's documentation! +======================================= + +A Python package to convert text into structured datasets. + +Navigation +========== + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + dataset_builder \ No newline at end of file diff --git a/txt2dataset/txt2dataset/__pycache__/__init__.cpython-311.pyc b/txt2dataset/txt2dataset/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0e3aada86ef4e8bac1a3d2a6bb908f4c104b0f0 GIT binary patch literal 256 zcmZ3^%ge<81odil>Doa0F^B^LOi;#WAs}NqLkdF*V-7l)7_I5@VMnGkTD zRivV=L6J&VRHR$by1Nxk1s{BNrCq5~ySx3g(MWhliiEVPwEYVTl`6jWoICN@<0REe zduQg%eV%(B_nvd^{le>Y6G-ag-;T|D3Hdh`N{L&}EWHbvJ47TR6DK1K{o6)t^v{m4 z^v{iO^v{p*^lzVVj5rw_V~e{cTq7=q*vM-{WUmvEyKf`p0enUpaf{3u65@Zr;&6!3 zIrXwQ5nXPjzf#|&At>G<5~TttV6GFG-v*FF%rG|x&@S=-9ikvQt}`P}(FrA&=mO{# z-2gqJ2j=tY>jolfM3L0~$yi*J=KWO7EZFWn^VU}#r(`l9n(8~_L@Qq0mu|%wEclXO(qw$DxQO0RQF5Pnh_z0)u zJ;-d;mq3y-n(f#Kq6-idkkB^PYvGd7< zB+9WX(s@x*E~&}XdG)ILl4wYK&HQqE_=2I1?$kBi84d%#YB-#3wCbw6)PSp3c>}-< zDK@ueIn5a?wzlKHrP#DF%gxr9)ura9PFmgA3_rnaZNg$r7M>-*Mxv%dp@8b7i=huP zcSr`+2stCErm_hcH%Dcdhs=Et&I5CqFt9K{jW$@}@;VqAV(Cxk$D~9grn{oa#As}6 zQkFtCxenTCtAYsI+T?op=^j|C;bqOFo2mp-(H4Nt$|e9HlSY(*u;BF0*5n*{=k|hg zyXM>unPN?S)+1xrsJT9&fb&sScR{9_)eNpmlc5X?r82|7+?UKSaD^(7JqQEygB& z{h!{v3Yk@XIx0wQ-C}GXlT;bal4V2u&mskvQWB|*c?+>r>(uJ3s&(439S7ldDQn%N zD{7ijXh+Gs58qNJ5OihWAP&HXDfX+V&%Ayg6HavC@15n%(2 zn+ZobMRNUFi zse{P7r`;-Au;nBQ8TL~mde!z)4RGPM;`@}`hxy3HgT8HE#E@;h|HfPaZjY?9}jnciI<8rQ)$@M2#gAJ>yC;5whuZ=AM}V zZx~jur6if|M@Fv&8!pMTci6Dwx-Sw%R|(_8iOGozlKca%zaLPGY5w$^C-=9d+3sDV zX92N8nWTMn2h1=TSCw>~dr+2>a@(@MjU~p~(zg9=LtveD0sU=yI5fuo&q^qjz!|oh< zeHoXnPE8*;^Qat4p-go9WJ-*vlI|M38kLMHJ|)Kzs?I8^9P&`3uk&;d%Xquks?eSdj^9_ZD9<8A#`&zMS z(;{KKeH7fX7rPGKvgaE^1>cUmZ^u)?mpwImYVLADXv54xL(}cpN0)A2$_;28dkYPF zVSHV~?Vj8(=jG4N7wTTh9xV#KCqmm}p)J4ZrTKaQ##<2fXu_Vnum{>dJbv@|9CKrM zb~royw6*<_BVWG{Ut!-*3pK$-Vgs2}!Udr=UpJ87z7JvDz~7F3&HQEPuR}(uAPj24 zU|tw3wsz$Azmcy$j;~Pt{sd`}N6e6+Ui%cnpTPFkgZ`bhv zEwMKm*!SUEYU2p`fPCQkz-<`x)y?^him=OS0B*WeGol7_2-u7SBTSQS9Wl+QrfGx` z=IEShuuC}b4|vuv-tw@Dq1AF)lZrJ)i_H4cP*O#o4T4XmvOR~JWJStUndKT%#t1x? 
z3l*-w>LhNEuxa>38+{w6BmVN`^ zWA+XigMjFr(0N#))6BH(2A_lamFWhXfg2A~BoHQ6>h}hBAFRs*)53gYo85tjy|U|r z^TiRs8nWxQ?p-<)1}B%ClJ`TEd;kF+`4qH#;OJ&%W~i@w;XW9TDQXms^?m7{YDY(% zzj0dbJ{V6%BXOk+ zSEQ48t3YFa-Ify3^+&{TRJlTriMlNb5eL1PuuE5ga)lloA%>7tor@{4guEA~p)m-J zNn{MJbv7;~bZ1PJAV`{0WIzUSP%@S%g)TO6RCk&89l9qHkK@4#!!zBzd~Xtic!`HAq)Whr2zMUx zA*6&EEvuD=FzJUcbdPIT~x7J|+9`tKf}=d|tn z3N5`_OK*Pv39aR1AvmlBhqK3uwe`17=Yu=6z|JRuU5^919<46~4rqY`dH(^e_F%UE z`{vfWy`LPod%!r%8h0kehG5oH3^Zol-+OECv2*>OIPN-f7xUhpg11NW_AC;YtDkwa zchP29gjFl6xVhttoqyW-S=Z-Xi!5|}hD-K2g6!arcCx-r3wF-GRS53Uf_oMT@9l?} z1>r3Y?tA8%$`1rp_3P(CT7AbNVQc%DqQCWtf9qra*4&l(n6_t>fiXp zzvZ!iOKvpZ^$Tt5;e!8&=0B3B@1lp!;zH^T9t`-5^7EX+%KEkP7z(Y`xq%3cAP9^~ zg>|zh=9&w_7ERdlMCg1hbk3jEx((2YSL9VEc5&#K%dqBdKb>-Ho z9A)BXyRtwQrAIZ$^NVQukpH9fRVR#IVKa%4CdYDZqeI`7?kS^I5dH_!d}z zCD6d~e%Ki$;N~!A0>Oqw6FAKrt*a^;bwM7_4;66l1|C*|3Hlg51gmKUFz83pLms$q zizLK|A`c@c7>3AiB0$qlU4ULcmQ0StrS8&6`-%jYrb1