Skip to content

Commit 8577c99

Browse files
Refactor datasets (#35)
* added webex bot * added webex meeting support * removed tokens * added list meetings, summarize selected meetings * added webex_meetings example * updated dynamic api * working commit dataset refactor, added WebEx Dataset * fixed routes merge error * added requirements for webex bot * updated webexbot example * fixed review changes
1 parent be46d96 commit 8577c99

File tree

17 files changed

+515
-84
lines changed

17 files changed

+515
-84
lines changed

backend/datasets/common/BillSum.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
2+
# Copyright 2022 Cisco Systems, Inc. and its affiliates
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
# SPDX-License-Identifier: Apache-2.0
17+
18+
19+
"""
20+
BillSum
21+
====================================================
22+
This module extends the Dataset Class and is used to load specifically from the
23+
BillSum Dataset.
24+
25+
https://huggingface.co/datasets/billsum
26+
"""
27+
28+
from backend.datasets.interfaces.hugging_face_dataset import HuggingFaceDataset
29+
30+
31+
class BillSum(HuggingFaceDataset):
    """Dataset wrapper configured for the Hugging Face ``billsum`` dataset.

    The constructor forwards a fixed configuration to
    :class:`HuggingFaceDataset` (config ``1.0.0``, split ``ca_test``,
    document column ``text``, summary column ``summary``) and then marks
    the dataset type as ``summarization``.
    """

    # Function names this dataset declares support for.
    functions_supported = [
        "search",
        "summarization",
        "search_benchmark",
        "search_comparison",
    ]

    def __init__(self):
        """Initialise the parent loader with the BillSum settings."""
        loader_settings = {
            'dataset_name': 'billsum',
            'config': '1.0.0',
            'class_name': 'BillSum',
            'document_column': 'text',
            'summary_column': 'summary',
            'split': 'ca_test',
        }
        super().__init__(**loader_settings)
        self._dataset_type = 'summarization'
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
2+
# Copyright 2022 Cisco Systems, Inc. and its affiliates
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
# SPDX-License-Identifier: Apache-2.0
17+
18+
19+
"""
20+
CNN/DailyMail Dataset
21+
====================================================
22+
This module extends the Dataset Class and is used to load specifically from the
23+
CNN/DailyMail Dataset.
24+
25+
https://huggingface.co/datasets/cnn_dailymail
26+
"""
27+
28+
from backend.datasets.interfaces.hugging_face_dataset import HuggingFaceDataset
29+
30+
31+
class CNNDailyMail(HuggingFaceDataset):
    """Dataset wrapper configured for the Hugging Face ``cnn_dailymail`` dataset.

    The constructor forwards a fixed configuration to
    :class:`HuggingFaceDataset` (config ``3.0.0``, split ``validation``,
    document column ``article``, summary column ``highlights``) and then
    marks the dataset type as ``summarization``.
    """

    # Function names this dataset declares support for.
    functions_supported = [
        "search",
        "summarization",
    ]

    def __init__(self):
        """Initialise the parent loader with the CNN/DailyMail settings."""
        loader_settings = {
            'dataset_name': 'cnn_dailymail',
            'config': '3.0.0',
            'class_name': 'CNNDailyMail',
            'document_column': 'article',
            'summary_column': 'highlights',
            'split': 'validation',
        }
        super().__init__(**loader_settings)
        self._dataset_type = 'summarization'

backend/datasets/common/Squad.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
2+
# Copyright 2022 Cisco Systems, Inc. and its affiliates
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
# SPDX-License-Identifier: Apache-2.0
17+
18+
19+
"""
20+
Squad
21+
====================================================
22+
This module extends the HuggingFaceDataset Class and is used to load
23+
specifically from the Squad Dataset
24+
"""
25+
26+
from backend.datasets.interfaces.hugging_face_dataset import HuggingFaceDataset
27+
28+
29+
class Squad(HuggingFaceDataset):
    """Dataset wrapper for SQuAD v2, indexed by article title.

    After the parent loader has been initialised, the ``train`` split is
    grouped into ``_topic_content``, a nested mapping of the form
    ``title -> {context: [(question, answers), ...]}``.
    """

    # Function names this dataset declares support for.
    functions_supported = ["search", "summarization", "search_benchmark", "search_comparison"]

    def __init__(self):
        """Initialise the parent loader with the SQuAD v2 settings and build the index."""
        super().__init__(
            dataset_name='squad_v2',
            config='squad_v2',
            class_name='Squad',
            document_column='text',
            summary_column=None,
            split=None)
        self._dataset_type = 'search'
        # NOTE(review): relies on the parent constructor having populated
        # ``self._dataset`` with a 'train' split -- confirm in HuggingFaceDataset.
        self._topic_content = self._create_topic_content()

    # Internal helper functions to parse through the dataset.

    def _create_topic_content(self):
        """Group the ``train`` split by title and context.

        :return: mapping ``title -> {context: [(question, answers), ...]}``
            where ``answers`` is ``entry['answers']['text']``
        :rtype: dict
        """
        topic_content = {}

        for entry in self._dataset['train']:
            qa_pair = (entry['question'], entry['answers']['text'])
            # setdefault creates the title / context buckets on first sight,
            # replacing the explicit three-way "not in" branching.
            contexts = topic_content.setdefault(entry['title'], {})
            contexts.setdefault(entry['context'], []).append(qa_pair)

        return topic_content

    def _get_topic_titles(self):
        """Return all titles found in the dataset.

        :return: list of titles
        :rtype: list
        """
        return list(self._topic_content)

    def _get_title_info(self, title):
        """Return everything stored for a given title.

        :param title: the name of the title
        :type title: str
        :return: mapping ``context -> [(question, answers), ...]`` for the title
        :rtype: dict
        :raises KeyError: if ``title`` is not in the dataset
        """
        return self._topic_content[title]

    def _get_title_story(self, title):
        """Return the context passages of a given title.

        :param title: the name of the title
        :type title: str
        :return: the contexts (keys of the per-title mapping) as a new list
        :rtype: list
        :raises KeyError: if ``title`` is not in the dataset
        """
        return list(self._topic_content[title])
103+
104+

backend/datasets/common/User.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
2+
# Copyright 2022 Cisco Systems, Inc. and its affiliates
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
# SPDX-License-Identifier: Apache-2.0
17+
18+
19+
"""
20+
User
21+
====================================================
22+
This module is used to load custom text files uploaded by user(s).
23+
"""
24+
25+
import os
26+
27+
28+
from backend.config import DevelopmentConfig
29+
30+
31+
class User:
    """Dataset built from ``.txt`` files found in ``DevelopmentConfig.FILES_DIR``.

    Each qualifying file becomes one topic in ``_topic_content``: the key is
    the file name and the value is the list of lines returned by
    ``readlines()``.
    """

    # Function names this dataset declares support for.
    functions_supported = ["search", "summarization"]

    def __init__(self):
        """Set the dataset identifiers and load every ``.txt`` file."""
        self._class_name = 'User'
        self._dataset_name = 'User'

        self._topic_content = {}
        self._get_files()

    @staticmethod
    def _read_lines(full_path):
        """Read a UTF-8 text file and return its lines.

        A context manager is used so the file handle is closed even when
        reading or decoding fails.

        :param full_path: path of the file to read
        :type full_path: str
        :return: lines of the file, line endings included
        :rtype: list
        """
        with open(full_path, 'r', encoding='utf-8') as handle:
            return handle.readlines()

    def _get_files(self):
        """Load every ``.txt`` file located directly in the files directory."""
        for path in os.listdir(DevelopmentConfig.FILES_DIR):
            full_path = os.path.join(DevelopmentConfig.FILES_DIR, path)
            if os.path.isfile(full_path) and path.endswith('.txt'):
                self._topic_content[path] = self._read_lines(full_path)

    def _update_file(self, file_path):
        """Load a single ``.txt`` file if it has not been loaded yet.

        Files already present in ``_topic_content`` are left untouched, so
        this does not refresh the content of a known file.

        :param file_path: file name relative to the files directory
        :type file_path: str
        """
        if file_path in self._topic_content:
            return

        full_path = os.path.join(DevelopmentConfig.FILES_DIR, file_path)
        if os.path.isfile(full_path) and file_path.endswith('.txt'):
            self._topic_content[file_path] = self._read_lines(full_path)

    def _get_class_name(self):
        """Return the class name identifier (``'User'``)."""
        return self._class_name

    def _get_dataset_name(self):
        """Return the dataset name identifier (``'User'``)."""
        return self._dataset_name

    def _get_topic_titles(self):
        """Return all available topics (loaded file names).

        :return: list of topics
        :rtype: list
        """
        return list(self._topic_content)

    def _get_title_info(self, title):
        """Return the stored content for a given title.

        :param title: specify the title
        :type title: str
        :return: the lines stored for the title (the stored list itself)
        :rtype: list
        :raises KeyError: if ``title`` has not been loaded
        """
        return self._topic_content[title]

    def _get_title_story(self, title):
        """Return the text of a topic for a given title as a new list.

        :param title: specify the title
        :type title: str
        :return: a copy of the list of lines stored for the title
        :rtype: list
        :raises KeyError: if ``title`` has not been loaded
        """
        return list(self._topic_content[title])

0 commit comments

Comments
 (0)