cisco-open
diff --git a/‎backend/datasets/common/BillSum.py‎
Lines changed: 43 additions & 0 deletions b/‎backend/datasets/common/BillSum.py‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎backend/datasets/common/CNNDailyMail.py‎
Lines changed: 42 additions & 0 deletions b/‎backend/datasets/common/CNNDailyMail.py‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎backend/datasets/common/Squad.py‎
Lines changed: 104 additions & 0 deletions b/‎backend/datasets/common/Squad.py‎
Lines changed: 104 additions & 0 deletions
diff --git a/‎backend/datasets/common/User.py‎
Lines changed: 104 additions & 0 deletions b/‎backend/datasets/common/User.py‎
Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,43 @@
+
+# Copyright 2022 Cisco Systems, Inc. and its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+"""
+BillSum
+====================================================
+This module extends the HuggingFaceDataset class and is used to load the
+BillSum dataset.
+
+https://huggingface.co/datasets/billsum
+"""
+
+from backend.datasets.interfaces.hugging_face_dataset import HuggingFaceDataset
+
+
+class BillSum(HuggingFaceDataset):
+    """HuggingFaceDataset subclass preconfigured for the ``billsum`` dataset.
+
+    The constructor forwards a fixed set of options (dataset name, config,
+    column names and split) to ``HuggingFaceDataset.__init__`` and then marks
+    the instance as a summarization dataset.
+    """
+
+    # Function names this dataset is declared to support.
+    # NOTE(review): presumably read elsewhere to decide which features offer
+    # this dataset -- confirm against the callers.
+    functions_supported = [
+        "search",
+        "summarization",
+        "search_benchmark",
+        "search_comparison",
+    ]
+
+    def __init__(self):
+        """Pass the BillSum options to the parent class and set the type."""
+        loader_options = {
+            "dataset_name": "billsum",
+            "config": "1.0.0",
+            "class_name": "BillSum",
+            "document_column": "text",
+            "summary_column": "summary",
+            "split": "ca_test",
+        }
+        super().__init__(**loader_options)
+        # Assigned after the parent constructor has run, as in the original.
+        self._dataset_type = "summarization"
@@ -0,0 +1,42 @@
+
+# Copyright 2022 Cisco Systems, Inc. and its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+"""
+CNN/DailyMail Dataset
+====================================================
+This module extends the HuggingFaceDataset class and is used to load the
+CNN/DailyMail dataset.
+
+https://huggingface.co/datasets/cnn_dailymail
+"""
+
+from backend.datasets.interfaces.hugging_face_dataset import HuggingFaceDataset
+
+
+class CNNDailyMail(HuggingFaceDataset):
+    """HuggingFaceDataset subclass preconfigured for ``cnn_dailymail``.
+
+    The constructor forwards a fixed set of options (dataset name, config,
+    column names and split) to ``HuggingFaceDataset.__init__`` and then marks
+    the instance as a summarization dataset.
+    """
+
+    # Function names this dataset is declared to support.
+    # NOTE(review): presumably read elsewhere to decide which features offer
+    # this dataset -- confirm against the callers.
+    functions_supported = [
+        "search",
+        "summarization",
+    ]
+
+    def __init__(self):
+        """Pass the CNN/DailyMail options to the parent and set the type."""
+        loader_options = {
+            "dataset_name": "cnn_dailymail",
+            "config": "3.0.0",
+            "class_name": "CNNDailyMail",
+            "document_column": "article",
+            "summary_column": "highlights",
+            "split": "validation",
+        }
+        super().__init__(**loader_options)
+        # Assigned after the parent constructor has run, as in the original.
+        self._dataset_type = "summarization"
@@ -0,0 +1,104 @@
+
+# Copyright 2022 Cisco Systems, Inc. and its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+"""
+Squad
+====================================================
+This module extends the HuggingFaceDataset class and is used to load the
+SQuAD v2 dataset.
+"""
+
+from backend.datasets.interfaces.hugging_face_dataset import HuggingFaceDataset
+
+
+class Squad(HuggingFaceDataset):
+    """HuggingFaceDataset subclass for ``squad_v2``, grouped by article title.
+
+    After the parent constructor has run, the ``train`` split of
+    ``self._dataset`` is regrouped into ``self._topic_content``, a nested
+    mapping of ``title -> {context: [(question, answers), ...]}``.
+    """
+
+    # Function names this dataset is declared to support.
+    # NOTE(review): presumably read elsewhere to decide which features offer
+    # this dataset -- confirm against the callers.
+    functions_supported = [
+        "search",
+        "summarization",
+        "search_benchmark",
+        "search_comparison",
+    ]
+
+    def __init__(self):
+        """Pass the SQuAD v2 options to the parent and build the title index."""
+        loader_options = {
+            "dataset_name": "squad_v2",
+            "config": "squad_v2",
+            "class_name": "Squad",
+            "document_column": "text",
+            "summary_column": None,
+            "split": None,
+        }
+        super().__init__(**loader_options)
+        self._dataset_type = "search"
+        # Must run after the parent constructor: the helper reads
+        # ``self._dataset``, which this class never assigns itself.
+        self._topic_content = self._create_topic_content()
+
+    # Internal helper functions to parse through the dataset.
+
+    def _create_topic_content(self):
+        """Group the ``train`` split by title, then by context.
+
+        The original author notes 442 titles in total.
+
+        :return: mapping ``title -> {context: [(question, answers), ...]}``
+            where ``answers`` is the entry's ``answers['text']`` value
+        :rtype: dict
+        """
+        grouped = {}
+
+        for row in self._dataset["train"]:
+            title = row["title"]
+            context = row["context"]
+            qa_pair = (row["question"], row["answers"]["text"])
+
+            # Create the per-title and per-context containers on first sight,
+            # then append; insertion order follows the dataset order.
+            contexts_for_title = grouped.setdefault(title, {})
+            contexts_for_title.setdefault(context, []).append(qa_pair)
+
+        return grouped
+
+    def _get_topic_titles(self):
+        """Return every title present in the grouped content.
+
+        :return: list of titles
+        :rtype: list
+        """
+        return list(self._topic_content)
+
+    def _get_title_info(self, title):
+        """Return the full entry stored for a title.
+
+        :param title: the name of the title
+        :type title: str
+        :return: mapping ``context -> [(question, answers), ...]`` for the
+            title (the stored object itself, not a copy)
+        :rtype: dict
+        """
+        return self._topic_content[title]
+
+    def _get_title_story(self, title):
+        """Return the contexts recorded for a title.
+
+        :param title: the name of the title
+        :type title: str
+        :return: new list holding the context strings for the title
+        :rtype: list
+        """
+        # Iterating the per-title dict yields its keys, i.e. the contexts.
+        return list(self._topic_content[title])
+    
+    
@@ -0,0 +1,104 @@
+
+# Copyright 2022 Cisco Systems, Inc. and its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+"""
+User
+====================================================
+This module is used to load custom text files uploaded by user(s).
+"""
+
+import os
+
+
+from backend.config import DevelopmentConfig
+
+
+class User:
+    """Holds the contents of user-uploaded ``.txt`` files, keyed by file name.
+
+    ``self._topic_content`` maps each file name found in
+    ``DevelopmentConfig.FILES_DIR`` to the list of lines read from that file.
+    """
+
+    # Function names this dataset is declared to support.
+    # NOTE(review): presumably read elsewhere to decide which features offer
+    # this dataset -- confirm against the callers.
+    functions_supported = ["search", "summarization"]
+
+    def __init__(self):
+        """Set the identifying names and load every ``.txt`` file present."""
+        self._class_name = 'User'
+        self._dataset_name = 'User'
+
+        self._topic_content = {}
+        self._get_files()
+
+    @staticmethod
+    def _read_lines(full_path):
+        """Read a UTF-8 text file and return its lines.
+
+        :param full_path: path of the file to read
+        :type full_path: str
+        :return: the file's lines, line endings included
+        :rtype: list
+        """
+        # The context manager closes the handle even if decoding fails.
+        with open(full_path, 'r', encoding='utf-8') as handle:
+            return handle.readlines()
+
+    def _load_if_text_file(self, file_name):
+        """Store the lines of ``file_name`` if it is a ``.txt`` regular file.
+
+        Anything else (directories, other extensions, missing files) is
+        skipped silently.
+
+        :param file_name: name relative to ``DevelopmentConfig.FILES_DIR``
+        :type file_name: str
+        """
+        # NOTE(review): file_name is joined to FILES_DIR without validation;
+        # if it can come from untrusted input, confirm callers sanitise it.
+        full_path = os.path.join(DevelopmentConfig.FILES_DIR, file_name)
+        if file_name.endswith('.txt') and os.path.isfile(full_path):
+            self._topic_content[file_name] = self._read_lines(full_path)
+
+    def _get_files(self):
+        """Load every ``.txt`` file found directly in ``FILES_DIR``.
+
+        Existing entries with the same name are overwritten.
+        """
+        for file_name in os.listdir(DevelopmentConfig.FILES_DIR):
+            self._load_if_text_file(file_name)
+
+    def _update_file(self, file_path):
+        """Load a single file unless it is already present.
+
+        :param file_path: file name relative to ``FILES_DIR``
+        :type file_path: str
+        """
+        if file_path not in self._topic_content:
+            self._load_if_text_file(file_path)
+
+    def _get_class_name(self):
+        """Return the class name string set in ``__init__``."""
+        return self._class_name
+
+    def _get_dataset_name(self):
+        """Return the dataset name string set in ``__init__``."""
+        return self._dataset_name
+
+    def _get_topic_titles(self):
+        """Return the names of all loaded files.
+
+        :return: list of file names
+        :rtype: list
+        """
+        return list(self._topic_content)
+
+    def _get_title_info(self, title):
+        """Return the stored lines for a file.
+
+        :param title: file name used as the key
+        :type title: str
+        :return: the stored list of lines (the same object, not a copy)
+        :rtype: list
+        :raises KeyError: if ``title`` has not been loaded
+        """
+        return self._topic_content[title]
+
+    def _get_title_story(self, title):
+        """Return a copy of the stored lines for a file.
+
+        :param title: file name used as the key
+        :type title: str
+        :return: new list containing the file's lines
+        :rtype: list
+        :raises KeyError: if ``title`` has not been loaded
+        """
+        return list(self._topic_content[title])