-
Notifications
You must be signed in to change notification settings - Fork 0
/
low_level_cornell.py
122 lines (108 loc) · 4.24 KB
/
low_level_cornell.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
"""
Logging provides a set of convenience functions for simple logging usage.
These are debug(), info(), warning(), error() and critical().
"""
import logging
import utils
"""
getLogger() returns a reference to a logger instance with the specified name if it is provided, or root if not
"""
# Module-level logger; load_dialogues() reports its progress through it.
log = logging.getLogger("cornell")
# Default data directory used by load_dialogues() when no data_dir is passed.
DATA_DIR = "./cornell"
# Field delimiter that iterate_entries() splits every line on.
SEPARATOR = "+++$+++"
"""
This function opens the given file from the data directory and reads it line by line.
Each line is decoded to a string (bytes that are not valid UTF-8 are ignored).
The line is then split on the separator, and str.strip removes the leading and
trailing whitespace from every resulting field.
Because the body uses the 'yield' statement, this is a generator function: calling it
returns a generator (an iterable) that produces one list of fields per line, lazily.
"""
def iterate_entries(data_dir, text_file):
    """Lazily yield the fields of each line of ``text_file`` in ``data_dir``.

    The file is read in binary mode and decoded line by line as UTF-8,
    silently dropping any bytes that cannot be decoded. Every line is split
    on ``SEPARATOR`` and each field is stripped of surrounding whitespace.

    Args:
        data_dir: Directory that contains the file.
        text_file: Name of the file inside ``data_dir``.

    Yields:
        A list of stripped field strings, one list per line of the file.
    """
    path = os.path.join(data_dir, text_file)
    with open(path, "rb") as handle:
        for raw_line in handle:
            # Decode per line so a bad byte only affects that line's text.
            text = raw_line.decode("utf-8", errors="ignore")
            yield [field.strip() for field in text.split(SEPARATOR)]
"""
This function gives out the set of movies which has the genre that we
requested.
"""
def get_filtered_set(data_dir, genre_filter):
    """Return the ids of the movies whose genre field contains ``genre_filter``.

    Reads ``movie_titles_metadata.txt`` from ``data_dir``; field 0 of each
    entry is taken as the movie id and field 5 as the genre text.

    Args:
        data_dir: Directory that contains ``movie_titles_metadata.txt``.
        genre_filter: Text to look for in the genre field.

    Returns:
        A set of movie ids (strings) whose genre field contains the filter.
    """
    return {
        fields[0]
        for fields in iterate_entries(data_dir, "movie_titles_metadata.txt")
        # Substring test on the raw genre text, so a filter such as "com"
        # also matches "comedy" -- same semantics as the previous .find().
        if genre_filter in fields[5]
    }
"""
movie_lines.txt has one dialogue line per row; the fields read here are the
line id (field 0), the movie id (field 2) and the dialogue text (field 4).
We just access the movie dialogue and tokenize it.
Our output is a dictionary for which keys are dialogue_id and values are phrase tokens.
"""
def read_phrases(data_dir, movies=None):
    """Read ``movie_lines.txt`` and tokenize every dialogue line.

    Field 0 of each entry is used as the dialogue id, field 2 as the movie id
    and field 4 as the dialogue text.

    Args:
        data_dir: Directory that contains ``movie_lines.txt``.
        movies: Optional collection of movie ids to keep. ``None`` (the
            default) keeps every movie; an empty collection keeps none.

    Returns:
        A dict mapping dialogue id to the result of ``utils.tokenize`` for
        that line. Lines whose tokenization is empty/falsy are left out.
    """
    out = {}
    for fields in iterate_entries(data_dir, "movie_lines.txt"):
        dialogue_id, m_id, dialogue = fields[0], fields[2], fields[4]
        # Test against None, not truthiness: an empty filter set means the
        # genre matched no movie, so nothing should pass. A plain
        # ``if movies`` would treat it as "no filter" and load everything.
        if movies is not None and m_id not in movies:
            continue
        tokens = utils.tokenize(dialogue)
        if tokens:
            out[dialogue_id] = tokens
    return out
"""
movie_conversations.txt has a list of the line_ids which make a conversation together.
"""
def load_conversations(data_dir, lines, movies=None):
    """Assemble conversations from ``movie_conversations.txt``.

    Field 2 of each entry is used as the movie id and field 3 as the textual
    list of line ids that make up one conversation.

    Args:
        data_dir: Directory that contains ``movie_conversations.txt``.
        lines: Mapping from line id to that line's phrase; ids missing from
            the mapping are skipped.
        movies: Optional collection of movie ids to keep. ``None`` (the
            default) keeps every movie; an empty collection keeps none.

    Returns:
        A list of conversations, each a non-empty list of the ``lines``
        values for the ids of that conversation, in file order.
    """
    out = []
    for fields in iterate_entries(data_dir, "movie_conversations.txt"):
        m_id, convo = fields[2], fields[3]
        # Test against None, not truthiness: an empty filter set must
        # exclude every movie instead of disabling the filter.
        if movies is not None and m_id not in movies:
            continue
        # Parse the textual list: drop the surrounding brackets, split on
        # ", ", then remove the quotes around each id.
        dialogue_ids = [part.strip("'") for part in convo.strip("[]").split(", ")]
        dialogue = [lines[line_id] for line_id in dialogue_ids if line_id in lines]
        if dialogue:
            out.append(dialogue)
    return out
"""
Data_dir can also be given as an argument, if different.
genre_filter is optional as well.
load_dialogues loads dialogues from cornell dataset.
Returns list of list of list of words.
"""
def load_dialogues(data_dir=DATA_DIR, genre_filter=''):
    """Load the dialogues of the dataset, optionally limited to one genre.

    Args:
        data_dir: Directory holding the dataset files; defaults to DATA_DIR.
        genre_filter: When non-empty, only movies selected by
            get_filtered_set() for this text are used.

    Returns:
        The list produced by load_conversations(): one entry per
        conversation, each a list of tokenized phrases.
    """
    if genre_filter:
        filter_movie = get_filtered_set(data_dir, genre_filter)
        log.info("Loaded %d movies belonging to %s genre", len(filter_movie), genre_filter)
    else:
        # No genre requested: None tells the readers not to filter by movie.
        filter_movie = None

    log.info("Reading and tokenizing phrases...")
    phrases = read_phrases(data_dir, movies=filter_movie)
    log.info("Loaded %d phrases", len(phrases))

    return load_conversations(data_dir, phrases, filter_movie)
"""
Takes data directory as input and gives out a dictionary of
movie ids mapped to their respective genres.
"""
def read_genres(data_dir):
    """Map every movie id to the list of its genre names.

    Reads ``movie_titles_metadata.txt`` from ``data_dir``; field 0 of each
    entry is taken as the movie id and field 5 as the textual genre list.

    Args:
        data_dir: Directory that contains ``movie_titles_metadata.txt``.

    Returns:
        A dict mapping movie id to a list of genre strings. A movie whose
        genre field is empty (``[]``) maps to an empty list.
    """
    out = {}
    for fields in iterate_entries(data_dir, "movie_titles_metadata.txt"):
        m_id, m_genres = fields[0], fields[5]
        # Parse the textual list: drop the brackets, split on ", ", unquote.
        names = (part.strip("'") for part in m_genres.strip("[]").split(", "))
        # An empty field "[]" splits into [""]; drop empty names so that such
        # a movie gets [] rather than a bogus single empty genre.
        out[m_id] = [name for name in names if name]
    return out
#print(read_genres('./cornell'))
#print(load_dialogues('./cornell','comedy'))