From a21c1fafc5cf60fe5d76d40adba4a9dca70705b1 Mon Sep 17 00:00:00 2001
From: arthurtham <19481191+arthurtham@users.noreply.github.com>
Date: Thu, 14 Nov 2019 17:55:32 -0800
Subject: [PATCH] Added multiple document support

The previous implementation ignored argv[1] after the initial run, so term frequencies could only ever be stored and retrieved for the first document. The script now loads each new document on demand and queries word counts by its doc_id.
---
 26-persistent-tables/tf-26.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/26-persistent-tables/tf-26.py b/26-persistent-tables/tf-26.py
index 8ed7224..c59abf4 100755
--- a/26-persistent-tables/tf-26.py
+++ b/26-persistent-tables/tf-26.py
@@ -14,10 +14,11 @@ def create_db_schema(connection):
     c.close()
 
 def load_file_into_database(path_to_file, connection):
-    """ Takes the path to a file and loads the contents into the database """
+    """ Takes the path to a file and loads the contents into the database,
+    then returns the doc_id """
     def _extract_words(path_to_file):
         with open(path_to_file) as f:
-            str_data = f.read()    
+            str_data = f.read()
         pattern = re.compile('[\W_]+')
         word_list = pattern.sub(' ', str_data).lower().split()
         with open('../stop_words.txt') as f:
@@ -50,6 +51,7 @@ def _extract_words(path_to_file):
         word_id += 1
     connection.commit()
     c.close()
+    return doc_id
 
 #
 # Create if it doesn't exist
@@ -57,12 +59,24 @@ def _extract_words(path_to_file):
 if not os.path.isfile('tf.db'):
     with sqlite3.connect('tf.db') as connection:
         create_db_schema(connection)
-        load_file_into_database(sys.argv[1], connection)
+        load_file_into_database(os.path.abspath(sys.argv[1]), connection)
 
 # Now, let's query
 with sqlite3.connect('tf.db') as connection:
     c = connection.cursor()
-    c.execute("SELECT value, COUNT(*) as C FROM words GROUP BY value ORDER BY C DESC")
+    
+    # Determine if we need to generate new words based on the filename provided
+    c.execute("SELECT id FROM documents WHERE name=?", (os.path.abspath(sys.argv[1]),))
+    row = c.fetchone()
+    if row == None:
+        # document ID didn't exist: create words
+        doc_id = load_file_into_database(os.path.abspath(sys.argv[1]), connection)
+    else:
+        # Get the document ID
+        doc_id = row[0]
+    
+    # Get the cached results from the database based on the filename provided
+    c.execute("SELECT value, COUNT(*) as C FROM words WHERE doc_id=? GROUP BY value ORDER BY C DESC", (doc_id,))
     for i in range(25):
         row = c.fetchone()
         if row != None: