From 45223927f3fda515642ab0ec616c61206d47018c Mon Sep 17 00:00:00 2001 From: Fadl Date: Tue, 12 Nov 2024 16:21:36 +0100 Subject: [PATCH 1/5] initial --- obstracts/server/models.py | 1 + obstracts/server/views.py | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/obstracts/server/models.py b/obstracts/server/models.py index d66e6a1..5e3550a 100644 --- a/obstracts/server/models.py +++ b/obstracts/server/models.py @@ -35,6 +35,7 @@ def upload_to_func(instance: 'File', filename): class File(models.Model): post_id = models.UUIDField(primary_key=True) markdown_file = models.FileField(upload_to=upload_to_func, null=True) + summary = models.CharField(max_length=65535, null=True) class FileImage(models.Model): diff --git a/obstracts/server/views.py b/obstracts/server/views.py index ed9b592..841d4ab 100644 --- a/obstracts/server/views.py +++ b/obstracts/server/views.py @@ -37,6 +37,7 @@ import mistune from mistune.renderers.markdown import MarkdownRenderer from mistune.util import unescape + class MarkdownImageReplacer(MarkdownRenderer): def __init__(self, request, queryset): self.request = request @@ -55,7 +56,10 @@ def codespan(self, token: dict[str, dict], state: mistune.BlockState) -> str: token['raw'] = unescape(token['raw']) return super().codespan(token, state) - + @classmethod + def get_markdown(cls, request, md_text, images_qs: 'models.models.BaseManager[models.FileImage]'): + modify_links = mistune.create_markdown(escape=False, renderer=cls(request, images_qs)) + return modify_links(md_text) @extend_schema_view( list=extend_schema( @@ -446,8 +450,8 @@ def get_post_objects(self, post_id, feed_id): @decorators.action(detail=True, methods=["GET"]) def markdown(self, request, feed_id=None, post_id=None): obj = get_object_or_404(models.File, post_id=post_id) - modify_links = mistune.create_markdown(escape=False, renderer=MarkdownImageReplacer(self.request, models.FileImage.objects.filter(report__post_id=post_id))) - return FileResponse(streaming_content=modify_links(obj.markdown_file.read().decode()), content_type='text/markdown', filename='markdown.md') + resp_text = MarkdownImageReplacer.get_markdown(request, obj.markdown_file.read().decode(), models.FileImage.objects.filter(report__post_id=post_id)) + return FileResponse(streaming_content=resp_text, content_type='text/markdown', filename='markdown.md') @extend_schema( responses={200: serializers.ImageSerializer(many=True), 404: api_schema.DEFAULT_404_ERROR, 400: api_schema.DEFAULT_400_ERROR}, @@ -485,6 +489,11 @@ def remove_report(self, post_id, collection): helper.execute_query(query, bind_vars={"@collection": f"{collection}_{c}", 'post_id': post_id}, paginate=False) + @decorators.action(methods=["GET"], detail=True, serializer_class=JobSerializer) + def summarize(self, request, feed_id=None, post_id=None): + obj = get_object_or_404(models.File, post_id=post_id) + return FileResponse(streaming_content=obj.summary, content_type='text/markdown', filename='summary.md') + @extend_schema_view( list=extend_schema( summary="Search Jobs", From f1546b6c3d12f37079ec7273774c98e9a1ddf268 Mon Sep 17 00:00:00 2001 From: Fadl Date: Wed, 13 Nov 2024 15:17:21 +0100 Subject: [PATCH 2/5] implement summary provider #24 --- docker-compose.yml | 1 - obstracts/cjob/tasks.py | 32 +++++++++++++++++++++----------- obstracts/server/serializers.py | 15 +++++++-------- obstracts/server/views.py | 17 +++++++++++------ obstracts/urls.py | 5 +---- requirements.txt | 4 ++-- 6 files changed, 42 insertions(+), 32 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 838768c..6fa2777 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -22,7 +22,6 @@ services: extends: env_django command: > bash -c " - # python manage.py collectstatic --no-input && python manage.py makemigrations && python manage.py migrate && gunicorn obstracts.wsgi:application --reload --bind 0.0.0.0:8001 diff --git a/obstracts/cjob/tasks.py b/obstracts/cjob/tasks.py index 306b297..4b129d1 100644 --- a/obstracts/cjob/tasks.py +++ b/obstracts/cjob/tasks.py @@ -6,6 +6,7 @@ import typing from dogesec_commons.stixifier.stixifier import StixifyProcessor, ReportProperties +from dogesec_commons.stixifier.summarizer import parse_summarizer_model from ..server.models import Job, FeedProfile from ..server import models @@ -78,27 +79,27 @@ def poll_job(job_id): current_task.retry(max_retries=200) -def new_task(feed_dict, profile_id): +def new_task(feed_dict, profile_id, summary_provider): kwargs = dict(id=feed_dict["feed_id"], profile_id=profile_id) if title := feed_dict.get("title"): kwargs.update(title=title) feed, _ = FeedProfile.objects.update_or_create(defaults=kwargs, id=feed_dict["feed_id"]) job = Job.objects.create(id=feed_dict["job_id"], feed=feed, profile_id=profile_id) - (poll_job.s(job.id) | start_processing.s(job.id)).apply_async( + (poll_job.s(job.id) | start_processing.s(job.id, summary_provider)).apply_async( countdown=5, root_id=job.id, task_id=job.id ) return job -def new_post_patch_task(input_dict, profile_id): +def new_post_patch_task(input_dict, profile_id, summary_provider): job = Job.objects.create(id=input_dict["job_id"], feed_id=input_dict["feed_id"], profile_id=profile_id) - (poll_job.s(job.id) | start_processing.s(job.id)).apply_async( + (poll_job.s(job.id) | start_processing.s(job.id, summary_provider)).apply_async( countdown=5, root_id=job.id, task_id=job.id ) return job @shared_task -def start_processing(h4f_job, job_id): +def start_processing(h4f_job, job_id, summary_provider): job = Job.objects.get(id=job_id) logging.info( f"processing {job_id=}, {job.feed_id=}, {current_task.request.root_id=}" @@ -128,7 +129,7 @@ def start_processing(h4f_job, job_id): ) break logging.info("processing %d posts for job %s", len(posts), job_id) - tasks = [process_post.si(job_id, post) for post in posts] + tasks = [process_post.si(job_id, post, summary_provider) for post in posts] tasks.append(job_completed_with_error.si(job_id)) return chain(tasks).apply_async() @@ -142,13 +143,13 @@ def set_job_completed(job_id): @shared_task -def process_post(job_id, post, *args): +def process_post(job_id, post, summary_provider, *args): job = Job.objects.get(id=job_id) post_id = str(post['id']) try: - file = io.BytesIO(post['description'].encode()) - file.name = f"post-{post_id}.html" - processor = StixifyProcessor(file, job.profile, job_id=job.id, file2txt_mode="html_article", report_id=post_id, base_url=post['link']) + stream = io.BytesIO(post['description'].encode()) + stream.name = f"post-{post_id}.html" + processor = StixifyProcessor(stream, job.profile, job_id=job.id, file2txt_mode="html_article", report_id=post_id, base_url=post['link']) processor.collection_name = job.feed.collection_name properties = ReportProperties( name=post['title'], @@ -165,8 +166,17 @@ def process_post(job_id, post, *args): ) processor.setup(properties, dict(_obstracts_feed_id=str(job.feed.id), _obstracts_post_id=post_id)) processor.process() - file, _ = models.File.objects.get_or_create(post_id=post_id) + if summary_provider: + logging.info(f"summarizing report {processor.report_id} using `{summary_provider}`") + try: + summary_provider = parse_summarizer_model(summary_provider) + file.summary = summary_provider.summarize(processor.output_md) + except BaseException as e: + print(f"got err {e}") + logging.info(f"got err {e}", exc_info=True) + + file.markdown_file.save('markdown.md', processor.md_file.open(), save=True) models.FileImage.objects.filter(report=file).delete() # remove old references diff --git a/obstracts/server/serializers.py b/obstracts/server/serializers.py index f46bc1c..f42f20f 100644 --- a/obstracts/server/serializers.py +++ b/obstracts/server/serializers.py @@ -2,6 +2,7 @@ from .models import Profile, Job, FileImage from drf_spectacular.utils import extend_schema_serializer, extend_schema_field from django.utils.translation import gettext_lazy as _ +from dogesec_commons.stixifier.summarizer import parse_summarizer_model class JobSerializer(serializers.ModelSerializer): @@ -12,25 +13,23 @@ class Meta: # fields = "__all__" exclude = ["feed", "profile"] - -class FeedSerializer(serializers.Serializer): +class CreateTaskSerializer(serializers.Serializer): profile_id = serializers.PrimaryKeyRelatedField(queryset=Profile.objects, error_messages={ 'required': _('This field is required.'), 'does_not_exist': _('Invalid profile with id "{pk_value}" - object does not exist.'), 'incorrect_type': _('Incorrect type. Expected profile id (uuid), received {data_type}.'), }) + ai_summary_provider = serializers.CharField(allow_blank=True, allow_null=True, validators=[parse_summarizer_model], default=None) + +class FeedSerializer(CreateTaskSerializer): url = serializers.URLField(help_text="The URL of the RSS or ATOM feed") include_remote_blogs = serializers.BooleanField(help_text="", default=False, required=False) class PatchFeedSerializer(FeedSerializer): url = None -class PatchPostSerializer(serializers.Serializer): - profile_id = serializers.PrimaryKeyRelatedField(queryset=Profile.objects, error_messages={ - 'required': _('This field is required.'), - 'does_not_exist': _('Invalid profile with id "{pk_value}" - object does not exist.'), - 'incorrect_type': _('Incorrect type. Expected profile id (uuid), received {data_type}.'), - }) +class PatchPostSerializer(CreateTaskSerializer): + pass class PostCreateSerializer(PatchPostSerializer): title = serializers.CharField() diff --git a/obstracts/server/views.py b/obstracts/server/views.py index 841d4ab..adb79e8 100644 --- a/obstracts/server/views.py +++ b/obstracts/server/views.py @@ -1,3 +1,4 @@ +import io import json import logging from urllib.parse import urljoin @@ -205,12 +206,14 @@ def make_request(cls, request, path, request_body=None): ) def create(self, request, *args, **kwargs): - profile_id = self.parse_profile(request) + + s = serializers.FeedSerializer(data=request.data) + s.is_valid(raise_exception=True) resp = self.make_request(request, "/api/v1/feeds/") if resp.status_code == 201: out = json.loads(resp.content) out['feed_id'] = out['id'] - job = tasks.new_task(out, profile_id) + job = tasks.new_task(out, s.data['profile_id'], s.data['ai_summary_provider']) return Response(JobSerializer(job).data, status=status.HTTP_201_CREATED) return resp @@ -264,7 +267,7 @@ def partial_update(self, request, *args, **kwargs): if resp.status_code == 201: out = json.loads(resp.content) out['feed_id'] = out['id'] - job = tasks.new_task(out, s.data.get("profile_id", feed.profile.id)) + job = tasks.new_task(out, s.data.get("profile_id", feed.profile.id), s.data['ai_summary_provider']) return Response(JobSerializer(job).data, status=status.HTTP_201_CREATED) return resp @@ -366,7 +369,7 @@ def partial_update(self, request, *args, **kwargs): self.remove_report(post_id, feed.collection_name) out = json.loads(resp.content) out['job_id'] = out['id'] - job = tasks.new_post_patch_task(out, s.data.get("profile_id", feed.profile.id)) + job = tasks.new_post_patch_task(out, s.data.get("profile_id", feed.profile.id), s.data['ai_summary_provider']) return Response(JobSerializer(job).data, status=status.HTTP_201_CREATED) return resp @@ -383,7 +386,7 @@ def create(self, request, *args, **kwargs): if resp.status_code == 201: out = json.loads(resp.content) out['job_id'] = out['id'] - job = tasks.new_post_patch_task(out, s.data.get("profile_id", feed.profile.id)) + job = tasks.new_post_patch_task(out, s.data.get("profile_id", feed.profile.id), s.data['ai_summary_provider']) return Response(JobSerializer(job).data, status=status.HTTP_201_CREATED) return resp @@ -492,7 +495,9 @@ def remove_report(self, post_id, collection): @decorators.action(methods=["GET"], detail=True, serializer_class=JobSerializer) def summarize(self, request, feed_id=None, post_id=None): obj = get_object_or_404(models.File, post_id=post_id) - return FileResponse(streaming_content=obj.summary, content_type='text/markdown', filename='summary.md') + if not obj.summary: + raise exceptions.NotFound(f"No Summary for post") + return FileResponse(streaming_content=io.BytesIO(obj.summary.encode()), content_type='text/markdown', filename='summary.md') @extend_schema_view( list=extend_schema( diff --git a/obstracts/urls.py b/obstracts/urls.py index 1c3604c..2f888b1 100644 --- a/obstracts/urls.py +++ b/obstracts/urls.py @@ -22,8 +22,7 @@ from drf_spectacular.views import SpectacularAPIView, SpectacularRedocView, SpectacularSwaggerView from django.conf import settings import dogesec_commons.objects.views as arango_views -from dogesec_commons.stixifier.views import ProfileView, ExtractorsView, WhitelistsView, AliasesView - +from dogesec_commons.stixifier.views import ProfileView, ExtractorsView API_VERSION = "v1" router = routers.SimpleRouter(use_regex_path=False) @@ -43,8 +42,6 @@ # txt2stix views router.register('extractors', ExtractorsView, "extractors-view") -router.register('whitelists', WhitelistsView, "whitelists-view") -router.register('aliases', AliasesView, "aliases-view") urlpatterns = [ diff --git a/requirements.txt b/requirements.txt index 8007c4d..74a1607 100644 --- a/requirements.txt +++ b/requirements.txt @@ -108,5 +108,5 @@ zipp==3.19.2; python_version >= '3.8' django-storages[s3]==1.14.4 stix2arango @ https://github.com/muchdogesec/stix2arango/archive/main.zip file2txt @ https://github.com/muchdogesec/file2txt/archive/main.zip -txt2stix @ https://github.com/muchdogesec/txt2stix/releases/download/main-2024-11-11/txt2stix-0.0.1b5-py3-none-any.whl -dogesec_commons[stixifier] @ https://github.com/muchdogesec/dogesec_commons/releases/download/main-2024-11-12/dogesec_commons-0.0.1b2-py3-none-any.whl \ No newline at end of file +txt2stix @ https://github.com/muchdogesec/txt2stix/releases/download/main-2024-11-13/txt2stix-0.0.1b5-py3-none-any.whl +dogesec_commons[stixifier] @ https://github.com/muchdogesec/dogesec_commons/releases/download/summarizer-2024-11-13/dogesec_commons-0.0.1b2-py3-none-any.whl \ No newline at end of file From 7dfb40fa8da89f0b2cf62a8697b420a24403ea74 Mon Sep 17 00:00:00 2001 From: Fadl Date: Wed, 13 Nov 2024 16:17:42 +0100 Subject: [PATCH 3/5] add summary and desc --- obstracts/server/views.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/obstracts/server/views.py b/obstracts/server/views.py index adb79e8..fb7c4fa 100644 --- a/obstracts/server/views.py +++ b/obstracts/server/views.py @@ -4,8 +4,8 @@ from urllib.parse import urljoin from django.http import HttpResponse, FileResponse from django.shortcuts import get_object_or_404 -from rest_framework import viewsets, decorators, exceptions, status -from drf_spectacular.utils import OpenApiParameter +from rest_framework import viewsets, decorators, exceptions, status, renderers +from drf_spectacular.utils import OpenApiParameter, OpenApiResponse from drf_spectacular.types import OpenApiTypes from .import autoschema as api_schema from dogesec_commons.objects.helpers import OBJECT_TYPES @@ -39,6 +39,10 @@ from mistune.renderers.markdown import MarkdownRenderer from mistune.util import unescape +class PlainMarkdownRenderer(renderers.BaseRenderer): + media_type = "text/markdown" + format = "text/markdown" + class MarkdownImageReplacer(MarkdownRenderer): def __init__(self, request, queryset): self.request = request @@ -492,7 +496,12 @@ def remove_report(self, post_id, collection): helper.execute_query(query, bind_vars={"@collection": f"{collection}_{c}", 'post_id': post_id}, paginate=False) - @decorators.action(methods=["GET"], detail=True, serializer_class=JobSerializer) + @extend_schema( + responses=None, + description="get summary of the file content", + summary="get summary of the file content", + ) + @decorators.action(methods=["GET"], detail=True) def summarize(self, request, feed_id=None, post_id=None): obj = get_object_or_404(models.File, post_id=post_id) if not obj.summary: From b68c618f15570f6dfbafccfe6b3b7cee71c287e6 Mon Sep 17 00:00:00 2001 From: Fadl Date: Wed, 13 Nov 2024 16:20:12 +0100 Subject: [PATCH 4/5] add desc to ai_summary_provider --- obstracts/server/serializers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/obstracts/server/serializers.py b/obstracts/server/serializers.py index f42f20f..d5cb0b1 100644 --- a/obstracts/server/serializers.py +++ b/obstracts/server/serializers.py @@ -19,7 +19,7 @@ class CreateTaskSerializer(serializers.Serializer): 'does_not_exist': _('Invalid profile with id "{pk_value}" - object does not exist.'), 'incorrect_type': _('Incorrect type. Expected profile id (uuid), received {data_type}.'), }) - ai_summary_provider = serializers.CharField(allow_blank=True, allow_null=True, validators=[parse_summarizer_model], default=None) + ai_summary_provider = serializers.CharField(allow_blank=True, allow_null=True, validators=[parse_summarizer_model], default=None, write_only=True, help_text="AI Summary provider int the format provider:model e.g `openai:gpt-3.5-turbo`") class FeedSerializer(CreateTaskSerializer): url = serializers.URLField(help_text="The URL of the RSS or ATOM feed") From e6a3198a20933bbb4c9fccaef56f62e3235c54ab Mon Sep 17 00:00:00 2001 From: Fadl Date: Thu, 14 Nov 2024 13:24:46 +0100 Subject: [PATCH 5/5] rename summarize to summary --- obstracts/server/views.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/obstracts/server/views.py b/obstracts/server/views.py index fb7c4fa..34993b1 100644 --- a/obstracts/server/views.py +++ b/obstracts/server/views.py @@ -498,16 +498,17 @@ def remove_report(self, post_id, collection): @extend_schema( responses=None, - description="get summary of the file content", - summary="get summary of the file content", + description="Get the summary of the Post", + summary="Get the summary of the post if `ai_summary_provider` was enabled.", ) @decorators.action(methods=["GET"], detail=True) - def summarize(self, request, feed_id=None, post_id=None): + def summary(self, request, feed_id=None, post_id=None): obj = get_object_or_404(models.File, post_id=post_id) if not obj.summary: raise exceptions.NotFound(f"No Summary for post") return FileResponse(streaming_content=io.BytesIO(obj.summary.encode()), content_type='text/markdown', filename='summary.md') + @extend_schema_view( list=extend_schema( summary="Search Jobs",