Merge pull request #1 from YaBoiSkinnyP/improve-project-structure
Improve project structure
TheJDen authored Oct 17, 2023
2 parents 6d8c9e7 + 691fc02, commit 53817e3
Showing 23 changed files with 702 additions and 229,315 deletions.
28 changes: 20 additions & 8 deletions README.md
@@ -7,19 +7,31 @@
 pip install -U certifi
 /Applications/Python 3.X/Install Certificates.command
 ```
 # decryptoai
 
-# word2vec integration
+decryptoai is a package that keeps track of my algorithm implementations. It is not deployed to PyPI because the algorithms are still experimental. It can be installed by cloning the repo and pip-installing the decryptoai directory.
+
+## notebooks/word2vec/word2vec_integration
 First look at how word2vec embeddings behave in the Decrypto context
 
-# synthetic datamuse
+## notebooks/datamuse
 Use the [Datamuse API](https://www.datamuse.com/api/) and asyncio to generate synthetic datasets
 
-# unsupervised guesser proof-of-concept
-Demonstrate original approach to interpretable Decrypto Guessing agent
+## notebooks/proof-of-concept
+Demonstrate original approach to interpretable Decrypto Guessing and Intercepting agents
 
-# word2vec unsupervised guessing
-Use word2vec embeddings to create heuristics that make our unsupervised guesser prototype competent
+## notebooks/word2vec
+Use word2vec embeddings to create heuristics that make our POC prototypes competent
+
+## notebooks/word2vec/word2vec_supervised_guesser.ipynb
+Explore a more traditional approach to the Guesser using neural nets. Has demonstrated that game-ification may inspire attention-based architecture!
 
-# word2vec supervised guessing
-Explore more traditional approach to Guesser using neural nets. May lead to proof that game-ification may inspire attention-based architecture.
+## notebooks/alternative_guessers.ipynb
+Other guessing algorithms I considered that used Competitive Programming techniques like Recursive Backtracking, Dynamic Programming, and enumerating bitsets

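The install step the README describes is compressed into one sentence; here is a minimal sketch of that flow under stated assumptions (the clone URL is hypothetical, and the decryptoai/ subdirectory holding pyproject.toml is inferred from this PR's file list):

```sh
# Hypothetical clone URL; the package itself lives in the decryptoai/
# subdirectory introduced by this PR (see decryptoai/pyproject.toml below).
git clone https://github.com/TheJDen/decrypto-ai-research.git
cd decrypto-ai-research
pip install ./decryptoai   # or `pip install -e decryptoai` for an editable install
```
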
Empty file.
14 changes: 14 additions & 0 deletions decryptoai/decryptoai/config.py
@@ -0,0 +1,14 @@
import pathlib

parent = pathlib.Path(__file__).resolve().parent.parent.parent # config.py -> decryptoai -> decryptoai -> research dir

data = parent / "data"
models = parent / "models"

GOOGLE_NEWS_PATH_NAME = models / "word2vec-google-news-300_c"

MEANING_JSON_PATH = data / "meaning.json"
TRIGGERWORD_JSON_PATH = data / "trigger_word.json"

MEANING_CSV_PATH = data / "meaning_clues.csv"
TRIGGERWORD_CSV_PATH = data / "triggerword_clues.csv"
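
config.py replaces the bare-filename constants that each module previously defined for itself, anchoring every artifact path to the repo root via `__file__`. A small sketch of how a consumer sees it (assuming the package is installed as sketched above):

```python
# Sketch: the shared config exposes pathlib.Path objects, so callers get
# the same data/ and models/ locations regardless of working directory.
import decryptoai.config as cfg

print(cfg.MEANING_JSON_PATH)           # <repo>/data/meaning.json
print(cfg.GOOGLE_NEWS_PATH_NAME)       # <repo>/models/word2vec-google-news-300_c
print(cfg.MEANING_JSON_PATH.exists())  # False until the datamuse notebook generates it
```

Anchoring on `__file__` rather than the current working directory is what lets the notebooks under notebooks/ and the modules under decryptoai/ resolve identical paths.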
Empty file.
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file.
File renamed without changes.
@@ -2,18 +2,11 @@
 import aiohttp
 import json
 import numpy as np
-import os
 import random
 from itertools import permutations
 
 __author__ = "Jaden Rodriguez"
 
-MEANING_JSON_PATH = "meaning.json"
-TRIGGERWORD_JSON_PATH = "trigger_word.json"
-
-MEANING_CSV_PATH = "meaning_clues.csv"
-TRIGGERWORD_CSV_PATH = "triggerword_clues.csv"
-
 
 def datamuse_url(endpoint: str, words: list[str]): # can add stuff for prefix/suffix support later
     query_str = '+'.join(words)
@@ -38,21 +31,23 @@ def create_dataset_dict(responses):
     meaning_dataset = {}
     for word, response in responses:
         response_object = json.loads(response)
-        meaning_dataset[word] = response_object
+        if response_object:
+            meaning_dataset[word] = response_object
     return meaning_dataset
 
-async def load_dataset_from_path(path_str, endpoint: str, words):
-    if not os.path.exists(path_str):
+
+async def load_dataset_from_path(path, endpoint: str, words):
+    if not path.exists():
+        if not path.parent.exists():
+            path.parent.mkdir()
         urls = [datamuse_url(endpoint, [word]) for word in words]
         responses = await fetch_text_responses(urls, words)
 
         dataset = create_dataset_dict(responses)
 
-        with open(path_str, 'w') as f:
+        with open(str(path), 'w') as f:
             json.dump(dataset, f)
     else:
-        with open(path_str) as f:
+        with open(str(path)) as f:
             dataset = json.load(f)
     return dataset
 
@@ -66,7 +61,7 @@ def filter_illegal_cluewords(legal_clue_func, datamuse_dataset):
 def clueword_from_dataset(datamuse_dataset, code_word, seed=400):
     candidate_words = []
     scores = []
-    if not datamuse_dataset[code_word]:
+    if code_word not in datamuse_dataset:
         return "garbage"
     for word_info in datamuse_dataset[code_word]:
         candidate_words.append(word_info["word"])
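Two behavioral changes land in this hunk: empty Datamuse responses are now dropped from the dataset, and a missing data directory is created instead of crashing `open()`. A hedged usage sketch follows; the renamed module's import path is not shown in this diff, so the `decryptoai.datamuse` name below is an assumption:

```python
# Assumed import path; the diff renames this module without showing the new name.
import asyncio
import decryptoai.config as cfg
import decryptoai.datamuse as dm  # hypothetical module name

words = ["apple", "river", "engine"]  # toy word list
# First run fetches from the Datamuse API and caches JSON under data/;
# subsequent runs read the cache instead.
dataset = asyncio.run(dm.load_dataset_from_path(cfg.MEANING_JSON_PATH, "words?ml", words))
print(dataset.get("apple", []))
```

One caveat worth noting: `path.parent.mkdir()` creates a single directory level, so `mkdir(parents=True, exist_ok=True)` would be sturdier if the data directory ever nests deeper.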
Empty file.
@@ -1,21 +1,22 @@
-import os
+import pathlib
 import gensim
 import gensim.downloader
 import gensim.models
+import decryptoai.config as cfg
 
-__author__ = "Jaden Rodriguez"
-
-GOOGLE_NEWS_PATH_NAME = "word2vec-google-news-300_c"
-
-def load_word2vec_keyedvectors(path_str, limit=200_000):
+__author__ = "Jaden Rodriguez"
 
 
-    if not os.path.exists(path_str):
+def load_word2vec_keyedvectors(*, path: pathlib.Path = cfg.GOOGLE_NEWS_PATH_NAME, limit=200_000, debug=False):
+    if not path.exists():
+        if debug:
+            print(f"{path.resolve()} not found, downloading")
         google_news_wv = gensim.downloader.load("word2vec-google-news-300")
-        google_news_wv.save_word2vec_format(path_str)
+        google_news_wv.save_word2vec_format(str(path))
         del google_news_wv
 
-    return gensim.models.KeyedVectors.load_word2vec_format(path_str, limit=limit)
+    return gensim.models.KeyedVectors.load_word2vec_format(str(path), limit=limit)
 
 def official_keyword_to_word(keyword: str):
     typos = { "CALENDA": "calendar"}
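The loader now defaults its path to the shared config and gates download logging behind `debug`. The notebook diff below shows the import path, so this usage sketch is grounded apart from the toy query:

```python
# Import path as used in the word2vec_integration notebook below.
import decryptoai.word2vec_loader.loader as wv_loader

# First call downloads word2vec-google-news-300 via gensim and caches it at
# cfg.GOOGLE_NEWS_PATH_NAME; later calls reload the cached copy.
google_news_wv = wv_loader.load_word2vec_keyedvectors(limit=200_000, debug=True)
print(google_news_wv.most_similar("clue", topn=3))
```

Unlike the datamuse loader, this function does not create a missing models/ directory before `save_word2vec_format`, so the same `path.parent.mkdir()` guard may be worth adding here too.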
10 changes: 10 additions & 0 deletions decryptoai/pyproject.toml
@@ -0,0 +1,10 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "decryptoai"
version = "0.0.1"
authors = [
{ name="Jaden Rodriguez", email="[email protected]" },
]
File renamed without changes.
@@ -25,12 +25,11 @@
}
],
"source": [
"# load pretrained word2vec model\n",
"import word2vec_loader as wv_loader\n",
"import decryptoai.word2vec_loader.loader as wv_loader\n",
"\n",
"limit = 200_000\n",
"print(f\"Loading {limit} keys\")\n",
"google_news_wv = wv_loader.load_word2vec_keyedvectors(wv_loader.GOOGLE_NEWS_PATH_NAME, limit)"
"google_news_wv = wv_loader.load_word2vec_keyedvectors(limit=limit, debug=True)"
]
},
{
@@ -42,9 +41,20 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Loading meaning dataset\n",
"Loading triggerword dataset\n",
"Done!\n"
"Loading meaning dataset\n"
]
},
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: '/Users/jadenrodriguez/Projects/decrypto-ai-research/data/meaning.json'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/Users/jadenrodriguez/Projects/decrypto-ai-research/notebooks/datamuse/synthetic_dataset.ipynb Cell 3\u001b[0m line \u001b[0;36m5\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/jadenrodriguez/Projects/decrypto-ai-research/notebooks/datamuse/synthetic_dataset.ipynb#W2sZmlsZQ%3D%3D?line=54'>55</a>\u001b[0m official_words \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(\u001b[39mmap\u001b[39m(wv_loader\u001b[39m.\u001b[39mofficial_keyword_to_word, dg\u001b[39m.\u001b[39mofficial_words\u001b[39m.\u001b[39menglish\u001b[39m.\u001b[39mwords))\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/jadenrodriguez/Projects/decrypto-ai-research/notebooks/datamuse/synthetic_dataset.ipynb#W2sZmlsZQ%3D%3D?line=56'>57</a>\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mLoading meaning dataset\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m---> <a href='vscode-notebook-cell:/Users/jadenrodriguez/Projects/decrypto-ai-research/notebooks/datamuse/synthetic_dataset.ipynb#W2sZmlsZQ%3D%3D?line=57'>58</a>\u001b[0m meaning_dataset \u001b[39m=\u001b[39m \u001b[39mawait\u001b[39;00m load_dataset_from_path(meaning_dataset_path, \u001b[39m\"\u001b[39m\u001b[39mwords?ml\u001b[39m\u001b[39m\"\u001b[39m, official_words)\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/jadenrodriguez/Projects/decrypto-ai-research/notebooks/datamuse/synthetic_dataset.ipynb#W2sZmlsZQ%3D%3D?line=59'>60</a>\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mLoading triggerword dataset\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/jadenrodriguez/Projects/decrypto-ai-research/notebooks/datamuse/synthetic_dataset.ipynb#W2sZmlsZQ%3D%3D?line=60'>61</a>\u001b[0m triggerword_dataset \u001b[39m=\u001b[39m \u001b[39mawait\u001b[39;00m load_dataset_from_path(triggerword_dataset_path, \u001b[39m\"\u001b[39m\u001b[39mwords?rel_trg\u001b[39m\u001b[39m\"\u001b[39m, official_words)\n",
"\u001b[1;32m/Users/jadenrodriguez/Projects/decrypto-ai-research/notebooks/datamuse/synthetic_dataset.ipynb Cell 3\u001b[0m line \u001b[0;36m4\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/jadenrodriguez/Projects/decrypto-ai-research/notebooks/datamuse/synthetic_dataset.ipynb#W2sZmlsZQ%3D%3D?line=39'>40</a>\u001b[0m responses \u001b[39m=\u001b[39m \u001b[39mawait\u001b[39;00m fetch_text_responses(urls, words)\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/jadenrodriguez/Projects/decrypto-ai-research/notebooks/datamuse/synthetic_dataset.ipynb#W2sZmlsZQ%3D%3D?line=41'>42</a>\u001b[0m dataset \u001b[39m=\u001b[39m create_dataset_dict(responses)\n\u001b[0;32m---> <a href='vscode-notebook-cell:/Users/jadenrodriguez/Projects/decrypto-ai-research/notebooks/datamuse/synthetic_dataset.ipynb#W2sZmlsZQ%3D%3D?line=43'>44</a>\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39;49m(\u001b[39mstr\u001b[39;49m(path), \u001b[39m'\u001b[39;49m\u001b[39mw\u001b[39;49m\u001b[39m'\u001b[39;49m) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/jadenrodriguez/Projects/decrypto-ai-research/notebooks/datamuse/synthetic_dataset.ipynb#W2sZmlsZQ%3D%3D?line=44'>45</a>\u001b[0m json\u001b[39m.\u001b[39mdump(dataset, f)\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/jadenrodriguez/Projects/decrypto-ai-research/notebooks/datamuse/synthetic_dataset.ipynb#W2sZmlsZQ%3D%3D?line=45'>46</a>\u001b[0m \u001b[39melse\u001b[39;00m:\n",
"File \u001b[0;32m~/Projects/decrypto-ai-research/.venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py:286\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[39mif\u001b[39;00m file \u001b[39min\u001b[39;00m {\u001b[39m0\u001b[39m, \u001b[39m1\u001b[39m, \u001b[39m2\u001b[39m}:\n\u001b[1;32m 280\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 281\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mIPython won\u001b[39m\u001b[39m'\u001b[39m\u001b[39mt let you open fd=\u001b[39m\u001b[39m{\u001b[39;00mfile\u001b[39m}\u001b[39;00m\u001b[39m by default \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 282\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 283\u001b[0m \u001b[39m\"\u001b[39m\u001b[39myou can use builtins\u001b[39m\u001b[39m'\u001b[39m\u001b[39m open.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 284\u001b[0m )\n\u001b[0;32m--> 286\u001b[0m \u001b[39mreturn\u001b[39;00m io_open(file, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/Users/jadenrodriguez/Projects/decrypto-ai-research/data/meaning.json'"
]
}
],
@@ -53,8 +63,9 @@
"import asyncio\n",
"import aiohttp\n",
"import decryptogame as dg\n",
"import decryptoai.config as cfg\n",
"import json\n",
"import os\n",
"import pathlib\n",
"\n",
"def datamuse_url(endpoint: str, words: list[str]): # can add stuff for prefix/suffix support later\n",
" query_str = '+'.join(words)\n",
@@ -77,30 +88,32 @@
" meaning_dataset = {}\n",
" for word, response in responses:\n",
" response_object = json.loads(response)\n",
" meaning_dataset[word] = response_object\n",
" if response_object:\n",
" meaning_dataset[word] = response_object\n",
" return meaning_dataset\n",
"\n",
"\n",
"# process responses for local storage\n",
"\n",
"async def load_dataset_from_path(path_str, endpoint: str, words):\n",
" if not os.path.exists(path_str):\n",
"\n",
"async def load_dataset_from_path(path: pathlib.Path, endpoint: str, words):\n",
" if not path.exists():\n",
" if not path.parent.exists():\n",
" path.parent.mkdir()\n",
" urls = [datamuse_url(endpoint, [word]) for word in words]\n",
" responses = await fetch_text_responses(urls, words)\n",
"\n",
" dataset = create_dataset_dict(responses)\n",
"\n",
" with open(path_str, 'w') as f:\n",
" with open(str(path), 'w') as f:\n",
" json.dump(dataset, f)\n",
" else:\n",
" with open(path_str) as f:\n",
" with open(str(path)) as f:\n",
" dataset = json.load(f)\n",
" return dataset\n",
"\n",
"\n",
"meaning_dataset_path = \"meaning.json\"\n",
"triggerword_dataset_path = \"trigger_word.json\"\n",
"meaning_dataset_path = cfg.MEANING_JSON_PATH\n",
"triggerword_dataset_path = cfg.TRIGGERWORD_JSON_PATH\n",
"\n",
"official_words = list(map(wv_loader.official_keyword_to_word, dg.official_words.english.words))\n",
"\n",
@@ -123,7 +136,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -151,7 +164,7 @@
"def clueword_from_dataset(datamuse_dataset, code_word, seed=400):\n",
" candidate_words = []\n",
" scores = []\n",
" if not datamuse_dataset[code_word]:\n",
" if code_word not in datamuse_dataset:\n",
" return \"garbage\"\n",
" for word_info in datamuse_dataset[code_word]:\n",
" candidate_words.append(word_info[\"word\"])\n",
@@ -165,7 +178,9 @@
" return tuple(clueword_from_dataset(datamuse_dataset, word, seed=seed) for word in codewords)\n",
"\n",
"def legal(keyword, word):\n",
" return (keyword not in word) and (word not in keyword) and word in google_news_wv\n",
" no_inclusion = (keyword not in word) and (word not in keyword)\n",
" no_british = word not in [\"armour\", \"moustache\", \"theatre\", \"mustache\", \"armor\", \"theater\"]\n",
" return no_inclusion and no_british and word in google_news_wv \n",
"\n",
"def codewords(keyword_card, code):\n",
" return [wv_loader.official_keyword_to_word(keyword_card[i]) for i in code]\n",
@@ -199,7 +214,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -322,10 +337,10 @@
" return list(permutations(range(keyword_card_length), clue_length))\n",
"\n",
"\n",
"meaning_csv = \"meaning_clues.csv\"\n",
"triggerword_csv = \"triggerword_clues.csv\"\n",
"meaning_csv_path = cfg.MEANING_CSV_PATH\n",
"triggerword_csv_path = cfg.TRIGGERWORD_CSV_PATH\n",
"\n",
"if not os.path.exists(meaning_csv) or not os.path.exists(triggerword_csv):\n",
"if not meaning_csv_path.exists() or not triggerword_csv_path.exists():\n",
"\n",
" num_keyword_cards = 1500\n",
" codes = all_possible_codes()\n",
Expand All @@ -345,13 +360,13 @@
" meaning_df = pandas.DataFrame(meaning_data, columns=header)\n",
" triggerword_df = pandas.DataFrame(triggerword_data, columns=header)\n",
" \n",
" meaning_df.to_csv(meaning_csv, index=False)\n",
" triggerword_df.to_csv(triggerword_csv, index=False)\n",
" meaning_df.to_csv(str(meaning_csv_path), index=False)\n",
" triggerword_df.to_csv(str(triggerword_csv_path), index=False)\n",
"\n",
"else:\n",
" meaning_df = pandas.read_csv(meaning_csv)\n",
" meaning_df = pandas.read_csv(str(meaning_csv_path))\n",
"\n",
"meaning_df.head()"
"meaning_df.sample(frac=1).head()"
]
},
{
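Buried in the notebook source above is the `legal` clue filter: a clue must not contain or be contained in the keyword, must not be one of a hard-coded list of British/American spelling variants (which word2vec treats as near-duplicate giveaways), and must exist in the word2vec vocabulary. A standalone sketch, with a toy vocabulary standing in for the `google_news_wv` membership test:

```python
# Standalone restatement of the notebook's legality check; VOCAB is a toy
# stand-in for `word in google_news_wv`.
SPELLING_VARIANTS = {"armour", "moustache", "theatre", "mustache", "armor", "theater"}
VOCAB = {"sword", "shield", "gallop"}

def legal(keyword: str, word: str) -> bool:
    no_inclusion = (keyword not in word) and (word not in keyword)  # bars "arm" for "army"
    no_british = word not in SPELLING_VARIANTS
    return no_inclusion and no_british and word in VOCAB

print(legal("knight", "sword"))   # True
print(legal("armor", "theatre"))  # False: blacklisted spelling variant
```

The switch from `meaning_df.head()` to `meaning_df.sample(frac=1).head()` at the end of the notebook serves a similar inspection goal: shuffling first keeps the preview from always showing the first keyword cards generated.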
@@ -22,7 +22,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 1,
"metadata": {},
"outputs": [
{
@@ -37,7 +37,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/5r/fmvyqtcd33111tp2chr8wmy40000gn/T/ipykernel_65985/711957956.py:21: RuntimeWarning: divide by zero encountered in log\n",
"/var/folders/5r/fmvyqtcd33111tp2chr8wmy40000gn/T/ipykernel_13522/711957956.py:21: RuntimeWarning: divide by zero encountered in log\n",
" log_probabilities = np.log(np.zeros(len(word_index)))\n"
]
}
@@ -90,7 +90,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -156,7 +156,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -229,7 +229,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 4,
"metadata": {},
"outputs": [
{
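The `RuntimeWarning: divide by zero encountered in log` preserved in this notebook's output is benign: `np.log(np.zeros(n))` is simply a way of initializing a log-probability vector to -inf. A short sketch of the idiom and a warning-free equivalent (`word_index` is a stand-in, since the surrounding guesser code is not shown in this hunk):

```python
import numpy as np

word_index = {"apple": 0, "river": 1, "engine": 2}  # toy stand-in vocabulary

# Notebook idiom: log(0) == -inf, but NumPy emits a divide-by-zero warning.
with np.errstate(divide="ignore"):
    log_probabilities = np.log(np.zeros(len(word_index)))

# Equivalent initialization with no warning:
log_probabilities = np.full(len(word_index), -np.inf)
```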