Skip to content

Commit

Permalink
improved software design
Browse files Browse the repository at this point in the history
  • Loading branch information
yolanda93 committed Apr 5, 2016
1 parent 7e0b7e2 commit e06b909
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 53 deletions.
4 changes: 4 additions & 0 deletions ir_evaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
class IREvaluator(object):
    """Placeholder class; no attributes or methods are implemented yet.

    NOTE(review): presumably intended to evaluate the retrieval models
    defined in ir_system.py -- confirm before relying on it.
    """


146 changes: 93 additions & 53 deletions information_retrieval_system.py → ir_system.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
#!/usr/bin/python
###################################################################################
## @file ir_system.py
# @brief The ir_system.py module is a basic information retrieval system
# implemented using Python, NLTK and GenSIM.
# @authors Yolanda de la Hoz Simón
# @authors Yolanda de la Hoz Simon
###################################################################################
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim import corpora, models, similarities
from operator import itemgetter
import re
import sys
import abc

###################################################################################
## @class IRSystem
# @brief This class represents the information retrieval system, i.e., the basic methods
# used to preprocess and rank documents according to user queries.
###################################################################################
class InformationRetrievalSystem():
class IRSystem(object):

#################################################################################
## @brief Constructor
# @details This method initializes the class with the parameters introduced by
# the user and execute the query.
#################################################################################
def __init__(self):
print("constructor")
def __init__(self, corpus, queries):
__metaclass__ = abc.ABCMeta
self.corpus=corpus
self.queries=queries


#################################################################################
Expand All @@ -52,7 +52,7 @@ def create_dictionary(self,docs):
pdocs = [self.preprocess_document(doc) for doc in docs]
dictionary = corpora.Dictionary(pdocs)
dictionary.save('vsm.dict')
return dictionary
return dictionary,pdocs

#################################################################################
## @brief get_keyword_to_id_mapping
Expand All @@ -69,9 +69,8 @@ def get_keyword_to_id_mapping(self,dictionary):
# @param corpus Set of documents to be processed.
# @param dictionary The dictionary with the documents keywords.
#################################################################################
def docs2bows(self, corpus, dictionary, pdocs):
    """Convert the preprocessed documents into bag-of-words vectors.

    The vectors are also written to 'vsm_docs.mm' so they can be reloaded
    later.

    :param corpus: Set of documents to be processed (not used here).
    :param dictionary: The dictionary with the documents keywords.
    :param pdocs: The documents after preprocessing.
    :return: One bag-of-words vector per preprocessed document.
    """
    bows = []
    for tokens in pdocs:
        bows.append(dictionary.doc2bow(tokens))
    # Save the corpus in the Matrix Market format
    corpora.MmCorpus.serialize('vsm_docs.mm', bows)
    return bows

Expand All @@ -80,65 +79,106 @@ def docs2bows(self,corpus, dictionary):
## @brief create_documents_view
# @details Abstract hook: each retrieval model overrides this method to build its
# own representation of the documents (e.g. the TF-IDF model in IR_tf_idf).
# @param corpus Set of documents to be processed.
#################################################################################
def create_TF_IDF_model(self,corpus):
dictionary = self.create_dictionary(corpus)
self.docs2bows(corpus, dictionary)
loaded_corpus = corpora.MmCorpus('vsm_docs.mm') # Recover the corpus
tfidf = models.TfidfModel(loaded_corpus)
return tfidf, dictionary
#################################################################################
@abc.abstractmethod
def create_documents_view(self, corpus):
    """Hook for building the model-specific view of the documents.

    The base version does nothing and returns None; retrieval models
    provide their own implementation.

    :param corpus: Set of documents to be processed.
    """
    return None

#################################################################################
## @brief create_query_view
# @details Abstract hook: each retrieval model overrides this method to build its
# own representation of a query.
# @param query Query, a document with the set of relevant words for the user.
#################################################################################
@abc.abstractmethod
def create_query_view(self, query):
    """Hook for building the model-specific view of a query.

    The base version does nothing and returns None; retrieval models
    provide their own implementation.

    :param query: The query to be represented.
    """
    return None

#################################################################################
## @brief ranking_function
# @details Abstract hook: each retrieval model overrides this method to score the
# documents of the corpus against the query and rank them.
# @param corpus Set of documents to be processed.
# @param q Query, a document with the set of relevant words for the user.
#################################################################################
@abc.abstractmethod
def ranking_function(self, corpus, q):
    """Hook for ranking the documents of the corpus against a query.

    The base version does nothing and returns None; retrieval models
    provide their own implementation.

    :param corpus: Set of documents to be processed.
    :param q: The query the documents are ranked against.
    """
    return None

#################################################################################
## @brief launch_query
# @details Abstract hook for executing the query q against the corpus.
# @param corpus Set of documents to be processed.
# @param q Query, a document with the set of relevant words for the user.
#################################################################################
#################################################################################
@abc.abstractmethod
def launch_query(self, corpus, q):
    """Hook for executing a query against the corpus.

    The base version does nothing and returns None. The stale call to
    ``create_TF_IDF_model`` was dropped because the TF-IDF logic now lives
    in ``IR_tf_idf.create_documents_view``.

    :param corpus: Set of documents to be processed.
    :param q: The query to execute.
    """
    return None

class IR_tf_idf(IRSystem):
    """TF-IDF retrieval model.

    Creating an instance immediately ranks the corpus against every query
    and prints the scores.
    """

    def __init__(self, corpus, queries):
        """Store the inputs, print the banner and rank the corpus per query.

        :param corpus: Set of documents to be processed.
        :param queries: Queries to be launched against the corpus.
        """
        IRSystem.__init__(self, corpus, queries)
        print("\n--------------------------Executing TF IDF information retrieval model--------------------------\n")
        # launch queries
        for query in queries:
            self.ranking_function(corpus, query)

    def create_documents_view(self, corpus):
        """Build the TF-IDF model of the corpus.

        :param corpus: Set of documents to be processed.
        :return: Tuple (tfidf model, keyword dictionary).
        """
        keyword_dictionary, pdocs = self.create_dictionary(corpus)
        self.docs2bows(corpus, keyword_dictionary, pdocs)
        # Recover the corpus that docs2bows serialized to disk
        stored_corpus = corpora.MmCorpus('vsm_docs.mm')
        return models.TfidfModel(stored_corpus), keyword_dictionary

    def create_query_view(self, query, dictionary):
        """Preprocess the query and convert it into a bag-of-words vector.

        :param query: The query text.
        :param dictionary: The dictionary with the documents keywords.
        :return: Bag-of-words vector of the query.
        """
        return dictionary.doc2bow(self.preprocess_document(query))

    def ranking_function(self, corpus, q):
        """Print every document of the corpus with its similarity to q.

        Documents are printed in descending score order.

        :param corpus: Set of documents to be processed.
        :param q: Query, a document with the set of relevant words.
        """
        # NOTE(review): the model is rebuilt (and the files rewritten) on
        # every call, i.e. once per query -- consider building it once.
        tfidf, dictionary = self.create_documents_view(corpus)
        # NOTE(review): the index is built from the raw bag-of-words vectors
        # while the query is TF-IDF weighted -- confirm this is intended.
        index = similarities.MatrixSimilarity(
            corpora.MmCorpus('vsm_docs.mm'), num_features=len(dictionary))
        weighted_query = tfidf[self.create_query_view(q, dictionary)]
        # Similarities between the query and all indexed documents
        scores = index[weighted_query]
        # Documents most similar to the query are arranged first
        ranking = sorted(enumerate(scores), key=itemgetter(1), reverse=True)
        for doc, score in ranking:
            formatted_score = "%.3f" % round(score, 3)
            print("[ Score = " + formatted_score + "] " + corpus[doc])

class IRBoolean(IRSystem):
    """Boolean retrieval model (placeholder: nothing is implemented yet)."""

    def __init__(self, corpus, queries):
        """Store the inputs and announce that the model is not implemented.

        :param corpus: Set of documents to be processed.
        :param queries: Queries to be launched against the corpus.
        """
        super(IRBoolean, self).__init__(corpus, queries)
        print("\n--------------------------Executing Boolean information retrieval model--------------------------\n")
        print("Not implemented yet")

    def create_documents_view(self, corpus):
        """Not implemented yet"""
        return None

    def create_query_view(self, query):
        """Not implemented yet"""
        return None

    def ranking_function(self, corpus, q):
        """Not implemented yet"""
        return None

class IR_tf(IRSystem):
    """Term-frequency retrieval model (placeholder: nothing is implemented yet)."""

    def __init__(self, corpus, queries):
        """Store the inputs and announce that the model is not implemented.

        :param corpus: Set of documents to be processed.
        :param queries: Queries to be launched against the corpus.
        """
        IRSystem.__init__(self, corpus, queries)
        print("\n--------------------------Executing TF information retrieval model--------------------------\n")
        print("Not implemented yet")

    def create_documents_view(self, corpus):
        """Not implemented yet"""

    def create_model(self, corpus):
        """Backward-compatible alias of create_documents_view.

        The base class and the sibling models name this hook
        ``create_documents_view``; the old name is kept so existing callers
        keep working.
        """
        return self.create_documents_view(corpus)

    def create_query_view(self, query):
        """Not implemented yet"""

    def ranking_function(self, corpus, q):
        """Not implemented yet"""



Expand Down
58 changes: 58 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/python
import ir_system
import re
import sys

#################################################################################
## @brief preprocess_userinput
# @details This function reads the user input and transforms it into a list of texts.
# @param user_input The input given by the user (a text or the path of a file with texts)
#################################################################################
def preprocess_userinput(user_input):
    """Turn the user input into a list of texts.

    If the input contains a path separator it is treated as the path of a
    file holding several texts, each introduced by a ".I <number>" line
    followed by a ".W" line; the file is split on those delimiters.
    Otherwise the input is itself the single text (or query).

    :param user_input: A text/query, or the path of a file with texts.
    :return: List of texts.
    :raises SystemExit: With exit code 1 if the file cannot be read.
    """
    # The previous test `"/" or "\\" in user_input` was always true because
    # the non-empty literal "/" is truthy, so a plain text was always opened
    # as a file and the program exited.
    if "/" not in user_input and "\\" not in user_input:
        # Callers iterate over the result, so a single text is wrapped in a
        # list instead of being returned as a bare string.
        return [user_input]
    try:
        with open(user_input) as text_file:
            contents = text_file.read()
    except IOError:
        print(user_input + " - No such file or directory")
        sys.exit(1)  # a failure must not report success (was exit code 0)
    # Split on the delimiter; the element before the first delimiter is
    # dropped. NOTE(review): the dots are unescaped, so they match any
    # character -- pattern kept as it was, now as a raw string.
    return re.split(r".I \d*\n.W\n", contents)[1:]

#################################################################################
## @brief create_ir_system
# @details This method creates an information retrieval system with the model
# chosen by the user
# @param irmodel_choice The id of the information retrieval model chosen by the user
#################################################################################
def create_ir_system(irmodel_choice, corpus, query):
    """Create the information retrieval system chosen by the user.

    :param irmodel_choice: Id of the model: 0 Boolean, 1 TF, 2 TF-IDF.
    :param corpus: Set of documents handed to the model.
    :param query: Queries handed to the model.
    :return: An instance of the selected ir_system model class.
    :raises ValueError: If irmodel_choice is not a known model id.
    """
    model_names = {0: "IRBoolean", 1: "IR_tf", 2: "IR_tf_idf"}
    if irmodel_choice not in model_names:
        # An unknown id used to fall through and return None silently.
        raise ValueError(
            "Unknown information retrieval model id: %r (expected 0, 1 or 2)"
            % (irmodel_choice,))
    model_class = getattr(ir_system, model_names[irmodel_choice])
    return model_class(corpus, query)


#######################################################################################################################
## @brief The main function that enables the user to launch queries
#######################################################################################################################
if __name__ == '__main__':

    # Project banner, one print per line as before
    banner_lines = (
        "--------------------------------------------------------\n",
        "------------ Project: Information Retrieval System\n",
        "------------ Course: Data Science Master - Technical University of Madrid\n",
        "------------ Subject: Information Extraction, Retrieval and Intregation\n",
        "------------ Author: Yolanda de la Hoz Simon\n",
        "--------------------------------------------------------\n",
    )
    for banner_line in banner_lines:
        print(banner_line)

    # Ask for the corpus first, then for the queries
    corpus_input = raw_input("Write a text or enter the corpus path:\n")
    corpus_text = preprocess_userinput(corpus_input)
    query_input = raw_input("Write a query or enter a document path with a set of queries:\n")
    query_text = preprocess_userinput(query_input)

    # Let the user pick the retrieval model by its numeric id
    print("\n The available models are: \n 0:Boolean\n 1:TF\n 2:TF-IDF\n \n")
    irmodel_choice = raw_input("Please, choose an information retrieval model by entering the id of the model:\n")

    model_id = int(irmodel_choice)
    ir = create_ir_system(model_id, corpus_text, query_text)

4 changes: 4 additions & 0 deletions rocchio_algorithm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
class RocchioAlgorithm(object):
    """Placeholder class; no attributes or methods are implemented yet.

    NOTE(review): presumably intended to hold the Rocchio relevance-feedback
    algorithm -- confirm before relying on it.
    """


0 comments on commit e06b909

Please sign in to comment.