-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinverse_index_lab.py
110 lines (91 loc) · 4.23 KB
/
inverse_index_lab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# version code d345910f07ae
coursera = 1
# Please fill out this stencil and submit using the provided submission script.
import random
import dictutil
## 1: (Task 1) Movie Review
## Task 1
def movie_review(name):
    """
    Produce a one-line movie review picked at random.

    Input: the name of a movie (part of the stencil's interface; the review
           that is chosen does not depend on it)
    Output: a string (one of the review options), selected at random using randint
    """
    options = (
        "See it!",
        "A gem!",
        "Ideological claptrap!",
        "its awesome",
        "Don't see it ever",
        "wow!",
        "what a joke",
    )
    # randint includes both endpoints, so the upper bound is the last valid index.
    last_index = len(options) - 1
    pick = random.randint(0, last_index)
    return options[pick]
# Demo output for Task 1. movie_review never reads its argument, so "asdf" is
# only a placeholder movie name.
print("Star wars review: ", movie_review("asdf"))
# Demo call into the imported dictutil module.
# NOTE(review): listrange2dict is defined outside this file - presumably it maps
# the indices 0..n-1 to the list items; confirm against dictutil.
print("Listrange2dict: ", dictutil.listrange2dict(['A', 'B', 'C']))
## 2: (Task 2) Make Inverse Index
def makeInverseIndex(strlist):
    """
    Build an inverse index (word -> ids of the documents containing it).

    Input: a list of documents as strings
    Output: a dictionary that maps each word in any document to the set of
            document ids (ie, the position in strlist) of all documents
            containing that word.

    Words are the whitespace-separated tokens of a document. A string is
    therefore indexed only when it occurs as a whole word (e.g. "use"), not
    when it occurs as a substring of a longer word (e.g. "because").

    Example:
    >>> makeInverseIndex(['hello world','hello','hello cat','hellolot of cats']) == {'hello': {0, 1, 2}, 'cat': {2}, 'of': {3}, 'world': {0}, 'cats': {3}, 'hellolot': {3}}
    True
    """
    index = {}
    for doc_id, text in enumerate(strlist):
        for token in text.split():
            # setdefault creates the id set the first time a word is seen.
            index.setdefault(token, set()).add(doc_id)
    return index
def loadFromfile(fileName):
    """
    Read a text file and return its lines as a list of documents.

    Input: the path of a text file
    Output: a list with one string per line, in file order; each string keeps
            its line ending, exactly as produced by iterating the file
    Raises: OSError (e.g. FileNotFoundError) if the file cannot be opened
    """
    # The context manager closes the file as soon as the lines have been read,
    # instead of leaving the handle open until it is garbage-collected.
    with open(fileName) as f:
        return list(f)
# Demo output for Task 2: index the four sample documents used in the
# makeInverseIndex docstring example.
print("InvIndex: ", makeInverseIndex(['hello world','hello','hello cat','hellolot of cats']))
# Index stories_small.txt, treating each line of the file as one document.
# The relative path is opened as given, so the file must be reachable from the
# current working directory or this line raises.
print("File inv index: ", makeInverseIndex(loadFromfile("stories_small.txt")))
## 3: (Task 3) Or Search
def orSearch(inverseIndex, query):
    """
    Find the documents that contain at least one of the query words.

    Input: an inverse index, as created by makeInverseIndex, and a list of words to query
    Output: the set of document ids that contain _any_ of the specified words
            (a new set; words missing from the index contribute nothing)

    >>> idx = makeInverseIndex(['Johann Sebastian Bach', 'Johannes Brahms', 'Johann Strauss the Younger', 'Johann Strauss the Elder', ' Johann Christian Bach', 'Carl Philipp Emanuel Bach'])
    >>> orSearch(idx, ['Bach','the'])
    {0, 2, 3, 4, 5}
    >>> orSearch(idx, ['Johann', 'Carl'])
    {0, 2, 3, 4, 5}
    """
    matching_sets = [inverseIndex[term] for term in query if term in inverseIndex]
    # Starting from a fresh empty set keeps the index's own sets untouched.
    return set().union(*matching_sets)
# Demo output for Task 3: index six composer names (one document each) and run
# the two OR queries shown in the orSearch docstring.
idx = makeInverseIndex(['Johann Sebastian Bach', 'Johannes Brahms', 'Johann Strauss the Younger', 'Johann Strauss the Elder', ' Johann Christian Bach', 'Carl Philipp Emanuel Bach'])
print("orSearch", orSearch(idx, ['Johann', 'Carl']))
print("orSearch", orSearch(idx, ['Bach', 'the']))
## 4: (Task 4) And Search
def andSearch(inverseIndex, query):
    """
    Find the documents that contain every one of the query words.

    Input: an inverse index, as created by makeInverseIndex, and a list of words to query
    Output: the set of all document ids that contain _all_ of the specified words
            (a new set; the sets stored in the index are never modified).
            If any query word is missing from the index, no document can
            contain all of the words, so the result is the empty set.
            An empty query also yields the empty set.

    >>> idx = makeInverseIndex(['Johann Sebastian Bach', 'Johannes Brahms', 'Johann Strauss the Younger', 'Johann Strauss the Elder', ' Johann Christian Bach', 'Carl Philipp Emanuel Bach'])
    >>> andSearch(idx, ['Johann', 'the'])
    {2, 3}
    >>> andSearch(idx, ['Johann', 'Bach'])
    {0, 4}
    """
    posting_sets = []
    for word in query:
        if word not in inverseIndex:
            # A word that occurs in no document rules out every document.
            return set()
        posting_sets.append(inverseIndex[word])
    if not posting_sets:
        return set()
    # Copy the first set so the intersection never aliases an index entry,
    # then narrow it by every remaining word's set (works for one word too).
    return set(posting_sets[0]).intersection(*posting_sets[1:])
# Demo output for Task 4: build the same six-document composer index under a
# separate name and run the two AND queries shown in the andSearch docstring.
idx2 = makeInverseIndex(['Johann Sebastian Bach', 'Johannes Brahms', 'Johann Strauss the Younger', 'Johann Strauss the Elder', ' Johann Christian Bach', 'Carl Philipp Emanuel Bach'])
print("And search: ", andSearch(idx2, ['Johann', 'the']))
# NOTE(review): the label below reads "And searc: " - probably a typo for
# "And search: "; left unchanged here because it is program output.
print("And searc: ", andSearch(idx2, ['Johann', 'Bach']))