Skip to content

Commit

Permalink
Merge pull request #597 from CMU-IDeeL/f23_PrettyMeng
Browse files Browse the repository at this point in the history
rec8
  • Loading branch information
PrettyMeng authored Oct 27, 2023
2 parents 118a8b3 + a10b44a commit 851f616
Show file tree
Hide file tree
Showing 7 changed files with 124,511 additions and 9 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
1 change: 0 additions & 1 deletion F23/document/recitation/Recitation8/language_model.ipynb

This file was deleted.

48 changes: 48 additions & 0 deletions F23/document/recitation/Recitation8/shakespeare_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import argparse
import os
import sys

import numpy as np


def read_corpus(filename='t8.shakespeare.txt', start=243, stop=124440):
    """Read a window of non-blank lines from a text file into one string.

    Lines are numbered from 0. A line is kept only when its index ``pos``
    satisfies ``start < pos < stop`` (both bounds exclusive) and it contains
    at least one non-whitespace character.

    Args:
        filename: Path of the text file to read. Defaults to the file name
            this script originally hard-coded.
        start: Index of the last line to skip before the window opens.
        stop: Index of the first line after the window.
            NOTE(review): the defaults 243 / 124440 presumably trim the
            front and trailing matter of the default file -- confirm
            against the actual data file.

    Returns:
        The kept lines joined with a single space. Each line keeps its own
        line ending, so consecutive lines are separated by the line ending
        followed by a space. Returns an empty string when no line qualifies.

    Raises:
        OSError: If ``filename`` cannot be opened.
    """
    lines = []
    with open(filename, 'r') as f:
        for pos, line in enumerate(f):
            if pos >= stop:
                # Nothing at or beyond ``stop`` is kept, so stop reading.
                break
            if pos > start and line.strip():
                lines.append(line)
    return " ".join(lines)


def get_charmap(corpus):
    """Build the character vocabulary of ``corpus``.

    Args:
        corpus: The text whose distinct characters form the vocabulary.

    Returns:
        A ``(chars, charmap)`` tuple: ``chars`` is the sorted list of
        distinct characters, and ``charmap`` maps each character to its
        position in ``chars``.
    """
    alphabet = sorted(set(corpus))
    index_of = dict(zip(alphabet, range(len(alphabet))))
    return alphabet, index_of


def map_corpus(corpus, charmap):
    """Encode ``corpus`` as an array of integer ids.

    Args:
        corpus: Iterable of characters to encode.
        charmap: Mapping from character to integer id; every character in
            ``corpus`` must be a key.

    Returns:
        A NumPy ``int64`` array holding one id per character, in order.

    Raises:
        KeyError: If a character of ``corpus`` is missing from ``charmap``.
    """
    ids = []
    for ch in corpus:
        ids.append(charmap[ch])
    return np.asarray(ids, dtype=np.int64)


def to_text(line, charset):
    """Decode a sequence of ids back into a string.

    Args:
        line: Iterable of indices into ``charset``.
        charset: Indexable collection whose items are the string pieces to
            emit for each id.

    Returns:
        The concatenation of ``charset[idx]`` for every ``idx`` in ``line``.
    """
    pieces = (charset[idx] for idx in line)
    return "".join(pieces)


def main(argv):
    """Load the corpus, print summary statistics, and encode it.

    Args:
        argv: Command-line arguments without the program name. Not read by
            this function; accepted so the entry-point signature stays the
            same for callers.
    """
    # Read and process data
    corpus = read_corpus()
    print("Corpus: {}...{}".format(corpus[:50], corpus[-50:]))
    print("Total character count: {}".format(len(corpus)))
    chars, charmap = get_charmap(corpus)
    charcount = len(chars)
    print("Unique character count: {}".format(charcount))
    # Encode the corpus as integer ids. The result is not used further
    # here; the lookup raises KeyError if any character lacks an id.
    map_corpus(corpus, charmap)


# Script entry point: pass the command-line arguments, minus the program
# name, to main().
if __name__ == "__main__":
    main(sys.argv[1:])
Loading

0 comments on commit 851f616

Please sign in to comment.