Merge remote-tracking branch 'origin/master'

AshokR · Nov 18, 2016 · c3c63bd · c3c63bd
2 parents a7c1b10 + e84a251
commit c3c63bd
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -4,3 +4,17 @@
 #### [தமிழ் சொல்வகை குறியிடும் கருவி (POS tagger)](https://github.com/AshokR/TamilNLP/wiki/POS-Tagger)
 #### [தமிழ் உரை சுருக்கம் செய்யும் நிரல்](https://github.com/AshokR/TamilNLP/wiki/Text-Summary-Extractor)
 #### [தமிழ் வாக்கிய பிரிப்பான்](https://github.com/AshokR/TamilNLP/wiki/Tamil-Sentence-Splitter)
+#### [அடிக்கடி பயன்படுத்தும் வார்த்தைகள் (Stopwords)](https://github.com/AshokR/TamilNLP/wiki/Stopwords)
+---
+Copyright © 2016 இரா. அசோகன்
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/TextSummaryExtractor.py b/TextSummaryExtractor.py
@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import division
 import re
+import tamil
 
 # This is a naive text summarization algorithm
 # Created by Shlomi Babluki
@@ -22,24 +23,27 @@ def split_content_to_paragraphs(self, content):
 
     # Calculate the intersection between 2 sentences
     def sentences_intersection(self, sent1, sent2):
-
+        
         # split the sentence into words/tokens
-        s1 = set(sent1.split(" "))
-        s2 = set(sent2.split(" "))
-
+        # s1 = set(sent1.split(" "))
+        # s2 = set(sent2.split(" "))
+        s1 = set(tamil.utf8.get_letters(sent1))
+        s2 = set(tamil.utf8.get_letters(sent2))
+
         # If there is no intersection, just return 0
         # if (len(s1) + len(s2)) == 0:
         if len(s1.intersection(s2)) == 0:
             return 0
 
         # We normalize the result by the average number of words
-        return len(s1.intersection(s2)) / ((len(s1) + len(s2)) / 2)
+        return len(s1.intersection(s2)) / ((len(s1) + len(s2)) / 2.0)
 
     # Format a sentence - remove all non-alphabetic chars from the sentence
     # We'll use the formatted sentence as a key in our sentences dictionary
     def format_sentence(self, sentence):
         # sentence = re.sub(r'\W+', '', sentence)       # [\u0B80-\u0BFF]
-        sentence = re.sub(r'[\u0B80-\u0BFF]', '', sentence)
+        sentence = re.sub(r'\s+', '', sentence)
+        sentence = re.sub(r'\d+','',sentence)
         # print sentence
         return sentence
 
@@ -53,9 +57,15 @@ def get_sentences_ranks(self, content):
 
         # Calculate the intersection of every two sentences
         n = len(sentences)
-        values = [[0 for x in xrange(n)] for x in xrange(n)]
+        values = [[0 for x in range(n)] for x in range(n)]
         for i in range(0, n):
             for j in range(0, n):
+                # Metric for intersection is symmetric so we calculate 1/2 only
+                # For additional metrics see: ngram.Distance module in open-tamil
+                # Ref https://github.com/Ezhil-Language-Foundation/open-tamil/blob/master/ngram/Distance.py
+                if i >= j :
+                    values[i][j] = values[j][i]
+                    continue
                 values[i][j] = self.sentences_intersection(sentences[i], sentences[j])
 
         # Build the sentences dictionary
@@ -67,7 +77,10 @@ def get_sentences_ranks(self, content):
                 if i == j:
                     continue
                 score += values[i][j]
-            sentences_dic[self.format_sentence(sentences[i])] = score
+            kw = self.format_sentence(sentences[i])
+            if len(kw) != 0:
+                sentences_dic[kw] = score
+
         return sentences_dic
 
     # Return the best sentence in a paragraph
@@ -111,16 +124,13 @@ def get_summary(self, title, content, sentences_dic):
 
         return ("\n").join(summary)
 
-
 # Main method, just run "python TextSummaryExtractor.py"
 def main():
-
-
-    title = """
+    title = u"""
 குத்துச்சண்டை ஜாம்பவான் முகமது அலி மறைவு
     """
-
-    content = """
+    
+    content = u"""
 அமெரிக்காவின் முன்னாள் ஹெவி வெயிட் குத்துச்சண்டை வீரர் முகமது அலி காலமானார். அவருக்கு வயது 74. சுவாசக்கோளாறு காரணமாக முகமது அலி மரணமடைந்ததாக அவரது குடும் பத்தினர் வெளியிட்டுள்ள அறிக்கையில் கூறியுள்ளனர்.
 உலக குத்துச்சண்டை சாம்பியன் பட்டத்தை 3 முறை வென்று சாதனை படைத்தவர் முகமது அலி. அமெரிக்காவின் கென்டகி மாநிலத்தில் 1942-ம் ஆண்டு பிறந்த முகமது அலியின் இயற்பெயர் காசியஸ் க்ளே. தனது 18 வயதில் குத்துசண்டை களத்தில் இறங்கிய முகமது அலி 1960-ல் ஹெவிவெயிட் ஒலிம்பிக் தங்கப் பதக்கத்தை பெற்றார். இதைத்தொடர்ந்து குத்துச்சண்டை என்றாலே முகமது அலி என்று சொல்லும் அளவுக்கு புகழ்பெற்றார். குத்துச்சண்டை களத்தில் மட்டுமின்றி அமெரிக்காவில் அக்காலத்தில் தீவிரமாக பரவியிருந்த இனவெறிக்கு எதிராகவும் அவர் போராடினார். அவர் குவிக்கும் வெற்றிகள் கறுப்பின மக்களிடையே புதிய எழுச்சியை ஏற்படுத்தின.
 1960-ல் இருந்து 1981 வரை முகமது அலி குத்துச்சண்டை உலகின் முடிசூடா மன்னனாக இருந்தார். 61 தொழில்முறை குத்துச்சண்டை போட்டிகளில் 56-ல் வெற்றி பெற்று அனைவரையும் ஆச்சரியத்தில் ஆழ்த்தினார். இதில் 37 போட்டிகளில் நாக் அவுட் முறையில் வென்றதால் ‘நாக் அவுட் நாயகன்’ என்று அழைக்கப்பட்டார். 3 முறை உலக குத்துச்சண்டை சாம்பியன் பட்டத்தை வென்றார்.
@@ -142,14 +152,15 @@ def main():
     summary = st.get_summary(title, content, sentences_dic)
 
     # Print the summary
-    print summary
+    print(summary)
 
     # Print the ratio between the summary length and the original length
-    print ""
-    print "Original Length %s" % (len(title) + len(content))
-    print "Summary Length %s" % len(summary)
-    print "Summary Ratio: %s" % (100 - (100 * (len(summary) / (len(title) + len(content)))))
-
-
-if __name__ == '__main__':
-    main()
+    print(u"")
+    print(u"Original Length %s" % (len(title) + len(content)))
+    print(u"Summary Length %s" % len(summary))
+    print(u"Summary Ratio: %s" % (100 - (100 * (len(summary) / (len(title) + len(content))))))
+    import pprint
+    pprint.pprint(sentences_dic)
+
+if __name__ == u'__main__':
+    main()