diff --git a/api_server/legco/management/commands/load_hansard_json.py b/api_server/legco/management/commands/load_hansard_json.py index 4eb0f09..8440525 100644 --- a/api_server/legco/management/commands/load_hansard_json.py +++ b/api_server/legco/management/commands/load_hansard_json.py @@ -3,6 +3,7 @@ from django.db import IntegrityError from django.core.management.base import BaseCommand, CommandError from django.core.exceptions import ObjectDoesNotExist +from legco.models import Keyword from legco.models import Meeting, Vote, Motion, Individual, IndividualVote, VoteSummary, Party, MeetingHansard, MeetingSpeech, MeetingPersonel from dateutil.parser import * import os @@ -17,6 +18,9 @@ import sys import json from datetime import date, datetime +from collections import Counter +from textrank4zh import TextRank4Keyword, TextRank4Sentence + class Command(BaseCommand): help = 'Load hansard JSON into database' @@ -31,7 +35,8 @@ def delete_existing_hansard(self, url): hansard.speeches, hansard.members_absent, hansard.public_officers, - hansard.clerks] + hansard.clerks, + hansard.keywords] for q in queries: for o in q.all(): o.delete() @@ -110,7 +115,19 @@ def handle(self, *args, **options): hansard.public_officers.add(p) for c in clerks: hansard.clerks.add(c) + + all_s = "" for s in speeches: + all_s += s.text_ch + "\n" hansard.speeches.add(s) - print("New hansard ID=%d" % hansard.id) + #print("Calculating keyword") + #tr4w = TextRank4Keyword(allow_speech_tags=["nr", "ns", "nz", "nt"]) + #tr4w.analyze(text=all_s, lower=True, window=3) + #for item in tr4w.get_keywords(20, word_min_len=3): + # keyword = item.word + # m, created = Keyword.objects.get_or_create(keyword = keyword) + # m.keyword = keyword + # print(keyword) + # hansard.keywords.add(m) + #print("New hansard ID=%d" % hansard.id) hansard.save() diff --git a/api_server/legco/models.py b/api_server/legco/models.py index a4da559..29053e1 100644 --- a/api_server/legco/models.py +++ b/api_server/legco/models.py @@ -106,6 +106,7 @@ class MeetingHansard(models.Model): key = models.CharField(max_length=128, unique=True) source_url = models.CharField(max_length=2048) speeches = models.ManyToManyField(MeetingSpeech, related_name='hansard') + keywords = models.ManyToManyField(Keyword) members_present = models.ManyToManyField(MeetingPersonel, related_name='present') members_absent = models.ManyToManyField(MeetingPersonel, related_name='absent') public_officers = models.ManyToManyField(MeetingPersonel, related_name='officers') diff --git a/api_server/requirements.txt b/api_server/requirements.txt index c8cb43e..85d5634 100644 --- a/api_server/requirements.txt +++ b/api_server/requirements.txt @@ -11,3 +11,4 @@ python-dateutil==2.6.1 w3lib==1.19.0 jieba==0.39 lxml==4.1.1 +textrank4zh==0.3