
Feature/visualization api #62

Draft · wants to merge 53 commits into base: develop

53 commits:
4562ffa
Merge pull request #3 from IFRCGo/fix/docker-file
susilnem Nov 15, 2024
9baaf3a
Add gdacs extraction.
Rup-Narayan-Rajbanshi Nov 15, 2024
6218511
Remove old table and add different table for Logs and Extraction.
Rup-Narayan-Rajbanshi Nov 18, 2024
5a6a482
Add migration command to import Gdacs data.
Rup-Narayan-Rajbanshi Nov 18, 2024
55fb4fe
Update etl model structure.
Rup-Narayan-Rajbanshi Nov 19, 2024
3fe76ce
Add celery-beat-scheduler to schedule task.
Rup-Narayan-Rajbanshi Nov 21, 2024
76a9419
Add the validator for gdacs event
ranjan-stha Nov 21, 2024
a1fdecd
Implementation first level, second level, third level extraction in
Rup-Narayan-Rajbanshi Nov 22, 2024
d7fc7d2
Fix error in html scrapping for GDACS.
Rup-Narayan-Rajbanshi Nov 25, 2024
2dd7d45
Modify admin table for Extraction Data.
Rup-Narayan-Rajbanshi Nov 25, 2024
af81446
Set all fields to readonly in admin.
Rup-Narayan-Rajbanshi Nov 26, 2024
2872b22
Add sample.env file.
Rup-Narayan-Rajbanshi Nov 26, 2024
e6ca2c3
Add validator for response data.
Rup-Narayan-Rajbanshi Nov 27, 2024
cbd2c9c
Code clean up.
Rup-Narayan-Rajbanshi Nov 27, 2024
7b33117
Add file hash for response data.
Rup-Narayan-Rajbanshi Nov 28, 2024
472e637
Manage dublicate files.
Rup-Narayan-Rajbanshi Nov 28, 2024
ccb5d67
Add the validation for events geometry data;
ranjan-stha Nov 29, 2024
8299211
Separate independent task for extraction.
Rup-Narayan-Rajbanshi Dec 3, 2024
a84aec4
Solve issue arised after converting the ext into celery task.
Rup-Narayan-Rajbanshi Dec 4, 2024
21c7296
WIP: Apply retry mechanism for extraction.
Rup-Narayan-Rajbanshi Dec 4, 2024
9424b7e
WIP - Transformation logic structure added.
Rup-Narayan-Rajbanshi Dec 5, 2024
b30d28b
Refactor save extraction data.
Rup-Narayan-Rajbanshi Dec 6, 2024
e15d9f0
Fix issue for retry mechanism.
Rup-Narayan-Rajbanshi Dec 9, 2024
0d08aef
Add Population Exposure validator;
ranjan-stha Dec 10, 2024
a34a4b6
Integrate validation for population exposure data.
Rup-Narayan-Rajbanshi Dec 12, 2024
7be4829
Code clean up.
Rup-Narayan-Rajbanshi Dec 13, 2024
d2bba2a
Code clean up.
Rup-Narayan-Rajbanshi Dec 15, 2024
a3f50a9
Fix attempt no.
Rup-Narayan-Rajbanshi Dec 15, 2024
f222650
Code clean up.
Rup-Narayan-Rajbanshi Dec 15, 2024
41da836
Add logging settings.
Rup-Narayan-Rajbanshi Dec 15, 2024
6af626b
code clean up using pre-commit.
Rup-Narayan-Rajbanshi Dec 16, 2024
c39a77e
Add validators and rename filenames
ranjan-stha Dec 17, 2024
684f380
fetch event data from gdacs api.
Rup-Narayan-Rajbanshi Dec 17, 2024
633d8e2
Solve issue in validation.
Rup-Narayan-Rajbanshi Dec 17, 2024
82b41c1
pass hazard type in each extraction object.
Rup-Narayan-Rajbanshi Dec 17, 2024
1a485a5
feat: add `pystac-monty` as a submodule and integrate it
samshara Dec 20, 2024
bb556c2
Integrate docker with pystac-monty-submodule
Rup-Narayan-Rajbanshi Dec 20, 2024
3b0db7e
Merge pull request #53 from IFRCGo/feat/pystac-monty-submodule
Rup-Narayan-Rajbanshi Dec 20, 2024
d5af9e5
WIP transformation.
Rup-Narayan-Rajbanshi Dec 23, 2024
5e9b292
Wip: transformation
Rup-Narayan-Rajbanshi Dec 23, 2024
a7497d5
Add model for gdacs transformation.
Rup-Narayan-Rajbanshi Dec 25, 2024
aea6c28
Solve issue for transformation geo data.
Rup-Narayan-Rajbanshi Dec 26, 2024
96123f1
Add task for loading into stac api.
Rup-Narayan-Rajbanshi Dec 26, 2024
fc49b88
Add monty_etl id in the imput for stac api.
Rup-Narayan-Rajbanshi Dec 27, 2024
d014baf
Setup strawbettry
Rup-Narayan-Rajbanshi Dec 16, 2024
f1b3833
Add api for extraction data.
Rup-Narayan-Rajbanshi Dec 16, 2024
b3d050c
Update README file.
Rup-Narayan-Rajbanshi Dec 18, 2024
8ec8b6a
Add command to generate schema.graphql file.
Rup-Narayan-Rajbanshi Dec 18, 2024
17e6f44
Resolve cors issue.
Rup-Narayan-Rajbanshi Dec 19, 2024
b699f03
Add api for /me.
Rup-Narayan-Rajbanshi Dec 20, 2024
97ab669
Generate new .schema file.
Rup-Narayan-Rajbanshi Dec 20, 2024
0135cc2
- Add extraction queries
sudan45 Dec 30, 2024
64c1d42
Refactor code
sudan45 Dec 31, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -129,3 +129,4 @@ dmypy.json

# editors
.idea/
source_raw_data/
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "libs/pystac-monty"]
	path = libs/pystac-monty
	url = https://github.com/IFRCGo/pystac-monty.git
2 changes: 2 additions & 0 deletions Dockerfile
@@ -8,6 +8,8 @@ WORKDIR /code

COPY pyproject.toml poetry.lock /code/

COPY libs /code/libs

RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
    # Build required packages
25 changes: 25 additions & 0 deletions README.md
@@ -0,0 +1,25 @@
## Getting started

- Clone this repository: git@github.com:IFRCGo/montandon-etl.git
- Go to the directory containing manage.py.
- Create a .env file and copy all the environment variables from sample.env into it.
- Set your own environment variables in the .env file.
- Build the containers using this command:
```bash
docker compose up --build -d
```
- Run migration using this command:
```bash
docker-compose exec web python manage.py migrate
```
- Import GDACS data using this command:
```bash
docker-compose exec web python manage.py import_gdacs_data
```
- To view the imported data in the admin panel, create a superuser:
```bash
docker-compose exec web python manage.py createsuperuser
```
Fill in the prompts to create the superuser.
- Once the user is created, open localhost:8000/admin/ in your browser to view the data in the Extraction data table.
- To access the GraphQL server, go to: localhost:8000/graphql
Empty file added apps/common/__init__.py
Empty file added apps/common/admin.py
6 changes: 6 additions & 0 deletions apps/common/apps.py
@@ -0,0 +1,6 @@
from django.apps import AppConfig


class CommonConfig(AppConfig):
    default_auto_field = "django.db.models.BigAutoField"
    name = "apps.common"
14 changes: 14 additions & 0 deletions apps/common/dataloaders.py
@@ -0,0 +1,14 @@
import typing

from django.db import models

DjangoModel = typing.TypeVar("DjangoModel", bound=models.Model)


def load_model_objects(
    Model: typing.Type[DjangoModel],
    keys: list[int],
) -> list[DjangoModel]:
    qs = Model.objects.filter(id__in=keys)
    _map = {obj.pk: obj for obj in qs}
    return [_map[key] for key in keys]
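`load_model_objects` re-orders the fetched rows to match `keys`, which is the contract a DataLoader batch function must honour: the i-th result corresponds to the i-th key. A framework-free sketch of the same re-ordering idea, with a plain dict standing in for the queryset (all names here are illustrative, not part of the codebase):

```python
def load_objects_in_key_order(fetch_by_ids, keys):
    # fetch_by_ids may return rows in any order; re-map them by id
    # so the result lines up position-for-position with `keys`.
    _map = {obj["id"]: obj for obj in fetch_by_ids(keys)}
    return [_map[k] for k in keys]


def fake_fetch(ids):
    # Stand-in for Model.objects.filter(id__in=keys): rows come back unordered.
    rows = [{"id": 3, "name": "c"}, {"id": 1, "name": "a"}, {"id": 2, "name": "b"}]
    return [r for r in rows if r["id"] in ids]


print([o["name"] for o in load_objects_in_key_order(fake_fetch, [2, 3, 1])])
# ['b', 'c', 'a'] — order follows keys, not fetch order
```

Note that, like the helper above, a key with no matching row raises `KeyError`; a production loader may prefer to return `None` for missing keys.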
23 changes: 23 additions & 0 deletions apps/common/management/commands/generate_schema.py
@@ -0,0 +1,23 @@
import argparse

from django.core.management.base import BaseCommand
from strawberry.printer import print_schema

from main.graphql.schema import schema


class Command(BaseCommand):
    help = "Create schema.graphql file"

    def add_arguments(self, parser):
        parser.add_argument(
            "--out",
            type=argparse.FileType("w"),
            default="schema.graphql",
        )

    def handle(self, *args, **options):
        file = options["out"]
        file.write(print_schema(schema))
        file.close()
        self.stdout.write(self.style.SUCCESS(f"{file.name} file generated"))
37 changes: 37 additions & 0 deletions apps/common/migrations/0001_initial.py
@@ -0,0 +1,37 @@
# Generated by Django 5.1.3 on 2024-11-21 11:08

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):

    initial = True

    dependencies = []

    operations = [
        migrations.CreateModel(
            name='Region',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.IntegerField(choices=[(0, 'Africa'), (1, 'Americas'), (2, 'Asia Pacific'), (3, 'Europe'), (4, 'Middle East & North Africa')], verbose_name='name')),
            ],
        ),
        migrations.CreateModel(
            name='Country',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(blank=True, max_length=255, null=True, verbose_name='name')),
                ('iso3', models.CharField(blank=True, max_length=3, null=True, verbose_name='iso3')),
                ('iso', models.CharField(blank=True, max_length=2, null=True, verbose_name='iso2')),
                ('record_type', models.IntegerField(blank=True, choices=[(1, 'Country'), (2, 'Cluster'), (3, 'Region'), (4, 'Country Office'), (5, 'Representative Office')], help_text='Type of entity', null=True, verbose_name='type')),
                ('bbox', models.JSONField(blank=True, default=dict, null=True, verbose_name='bbox')),
                ('centroid', models.JSONField(blank=True, default=dict, null=True, verbose_name='centroid')),
                ('independent', models.BooleanField(default=None, help_text='Is this an independent country?', null=True)),
                ('is_deprecated', models.BooleanField(default=False, help_text='Is this an active, valid country?')),
                ('region', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='common.region', verbose_name='region')),
            ],
        ),
    ]
76 changes: 76 additions & 0 deletions apps/common/models.py
@@ -0,0 +1,76 @@
from django.db import models
from django.utils.translation import gettext_lazy as _


class UserResource(models.Model):
    created_at = models.DateTimeField(auto_now_add=True)
    modified_at = models.DateTimeField(auto_now=True)
    # Typing
    id: int
    pk: int

    class Meta:
        abstract = True
        ordering = ["-id"]


class Region(models.Model):
    class RegionName(models.IntegerChoices):
        AFRICA = 0, _("Africa")
        AMERICAS = 1, _("Americas")
        ASIA_PACIFIC = 2, _("Asia Pacific")
        EUROPE = 3, _("Europe")
        MENA = 4, _("Middle East & North Africa")

    name = models.IntegerField(
        verbose_name=_("name"),
        choices=RegionName.choices,
    )

    def __str__(self):
        return f"{self.name}"


class Country(models.Model):
    class CountryType(models.IntegerChoices):
        """
        We use the Country model for some things that are not "Countries". This helps classify the type.
        """

        COUNTRY = 1, _("Country")
        CLUSTER = 2, _("Cluster")
        REGION = 3, _("Region")
        COUNTRY_OFFICE = 4, _("Country Office")
        REPRESENTATIVE_OFFICE = 5, _("Representative Office")

    name = models.CharField(max_length=255, verbose_name=_("name"), null=True, blank=True)
    iso3 = models.CharField(max_length=3, verbose_name=_("iso3"), null=True, blank=True)
    iso = models.CharField(max_length=2, verbose_name=_("iso2"), null=True, blank=True)
    record_type = models.IntegerField(
        choices=CountryType.choices, verbose_name=_("type"), null=True, blank=True, help_text=_("Type of entity")
    )
    region = models.ForeignKey(Region, verbose_name=_("region"), null=True, blank=True, on_delete=models.SET_NULL)
    bbox = models.JSONField(
        default=dict,
        null=True,
        blank=True,
        verbose_name=_("bbox"),
    )
    centroid = models.JSONField(
        default=dict,
        null=True,
        blank=True,
        verbose_name=_("centroid"),
    )
    independent = models.BooleanField(default=None, null=True, help_text=_("Is this an independent country?"))
    is_deprecated = models.BooleanField(default=False, help_text=_("Is this an active, valid country?"))

    def __str__(self):
        return f"{self.name} - {self.iso3}"

    def save(self, *args, **kwargs):
        if self.iso3:
            self.iso3 = self.iso3.lower()
        if self.iso:
            self.iso = self.iso.lower()
        return super().save(*args, **kwargs)
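The `save` override normalizes ISO codes to lower case before writing, so lookups don't depend on the caller's casing. The normalization rule in isolation (a hypothetical helper, not part of the model):

```python
def normalize_iso(code):
    # Mirrors Country.save(): lower-case when a code is present,
    # pass falsy values (None, "") through untouched.
    return code.lower() if code else code


print(normalize_iso("NPL"))  # npl
print(normalize_iso(None))   # None
```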
Empty file added apps/common/tests.py
14 changes: 14 additions & 0 deletions apps/common/types.py
@@ -0,0 +1,14 @@
import strawberry_django
from django.contrib.auth.models import User
from strawberry import auto


@strawberry_django.type(User)
class UserMeType:
    id: auto
    username: auto
    first_name: auto
    last_name: auto
    email: auto
    is_staff: auto
    is_superuser: auto
Empty file added apps/common/views.py
Empty file added apps/etl/__init__.py
47 changes: 47 additions & 0 deletions apps/etl/admin.py
@@ -0,0 +1,47 @@
from django.contrib import admin

# Register your models here.
from .models import ExtractionData, GdacsTransformation


@admin.register(ExtractionData)
class ExtractionDataAdmin(admin.ModelAdmin):
    def get_readonly_fields(self, request, obj=None):
        # Use the model's fields to populate readonly_fields
        if obj:  # If the object exists (edit page)
            return [field.name for field in self.model._meta.fields]
        return []

    list_display = (
        "id",
        "source",
        "resp_code",
        "status",
        "parent__id",
        "resp_data_type",
        "source_validation_status",
        "hazard_type",
        "created_at",
    )
    list_filter = ("status",)
    autocomplete_fields = ["parent"]
    search_fields = ["parent"]


@admin.register(GdacsTransformation)
class GdacsTransformationAdmin(admin.ModelAdmin):
    def get_readonly_fields(self, request, obj=None):
        # Use the model's fields to populate readonly_fields
        if obj:  # If the object exists (edit page)
            return [field.name for field in self.model._meta.fields]
        return []

    list_display = (
        "id",
        "extraction",
        "item_type",
        "status",
    )
    list_filter = ("status",)
    autocomplete_fields = ["extraction"]
    search_fields = ["extraction"]
6 changes: 6 additions & 0 deletions apps/etl/apps.py
@@ -0,0 +1,6 @@
from django.apps import AppConfig


class EtlConfig(AppConfig):
    default_auto_field = "django.db.models.BigAutoField"
    name = "apps.etl"
21 changes: 21 additions & 0 deletions apps/etl/dataloaders.py
@@ -0,0 +1,21 @@
import typing

from asgiref.sync import sync_to_async
from common.dataloaders import load_model_objects
from django.utils.functional import cached_property
from strawberry.dataloader import DataLoader

from .models import ExtractionData

if typing.TYPE_CHECKING:
    from .types import ExtractionDataType


def load_extraction(keys: list[int]) -> list["ExtractionDataType"]:
    # Load ExtractionData objects for the requested keys (the original imported
    # and queried User here, which does not match this loader's purpose).
    return load_model_objects(ExtractionData, keys)  # type: ignore[reportReturnType]


class ExtractionDataLoader:
    @cached_property
    def load_extraction(self):
        return DataLoader(load_fn=sync_to_async(load_extraction))
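The `DataLoader` wrapper exists so that many per-object `load_extraction` calls issued while resolving one GraphQL query collapse into a single database batch. A toy illustration of that coalescing idea in plain asyncio (no strawberry dependency; all names illustrative):

```python
import asyncio


class ToyLoader:
    """Collect keys requested during one event-loop tick, then fetch them in a single batch."""

    def __init__(self, batch_fn):
        self.batch_fn = batch_fn
        self._pending = {}  # key -> Future awaiting its value

    async def load(self, key):
        if key not in self._pending:
            self._pending[key] = asyncio.get_running_loop().create_future()
            if len(self._pending) == 1:
                # First key this tick: schedule one dispatch for the whole batch.
                asyncio.get_running_loop().call_soon(self._dispatch)
        return await self._pending[key]

    def _dispatch(self):
        pending, self._pending = self._pending, {}
        for key, value in zip(pending, self.batch_fn(list(pending))):
            pending[key].set_result(value)


calls = []


def batch_fetch(keys):
    calls.append(keys)  # record that one call served every key
    return [k * 10 for k in keys]


async def main():
    loader = ToyLoader(batch_fetch)
    return await asyncio.gather(loader.load(1), loader.load(2), loader.load(3))


print(asyncio.run(main()))  # three loads, resolved from one batch_fetch call
```

strawberry's `DataLoader` adds per-key caching and per-request scoping on top of this; the sketch only shows the batching behaviour.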
9 changes: 9 additions & 0 deletions apps/etl/enums.py
@@ -0,0 +1,9 @@
import strawberry

from .models import ExtractionData

ExtractionDataStatusTypeEnum = strawberry.enum(ExtractionData.Status, name="ExtractionDataStatusTypeEnum")
ExtractionValidationTypeEnum = strawberry.enum(
    ExtractionData.ValidationStatus, name="ExtractionDataValidationStatusTypeEnum"
)
ExtractionSourceTypeEnum = strawberry.enum(ExtractionData.Source, name="ExtractionDataSourceTypeEnum")
85 changes: 85 additions & 0 deletions apps/etl/extract.py
@@ -0,0 +1,85 @@
import requests
from celery.utils.log import get_task_logger
from django.core.exceptions import ObjectDoesNotExist

from .models import ExtractionData

logger = get_task_logger(__name__)


class Extraction:
    def __init__(self, url: str):
        self.url = url

    def _get_file_extension(self, content_type):
        mappings = {
            "application/json": "json",
            "text/html": "html",
            "application/xml": "xml",
            "text/csv": "csv",
        }
        return mappings.get(content_type, "txt")

    def pull_data(self, source: int, retry_count: int, timeout: int = 30, ext_object_id: int | None = None):
        resp_status = ExtractionData.Status.IN_PROGRESS
        source_validation_status = ExtractionData.ValidationStatus.NO_VALIDATION
        instance_obj = None

        # Update extraction object status to in_progress
        if ext_object_id:
            try:
                instance_obj = ExtractionData.objects.get(id=ext_object_id)
                instance_obj.resp_code = resp_status
                instance_obj.attempt_no = retry_count
                instance_obj.save(update_fields=["resp_code", "attempt_no"])
            except ExtractionData.DoesNotExist:
                raise ObjectDoesNotExist(f"ExtractionData object with ID {ext_object_id} not found")

        try:
            response = requests.get(self.url, timeout=timeout)
            resp_type = response.headers.get("Content-Type", "")
            file_extension = self._get_file_extension(resp_type)

            # A 204 is a successful response with no body; check it before the
            # generic failure branch so it is not treated as an error.
            if response.status_code == 204:
                source_validation_status = ExtractionData.ValidationStatus.NO_DATA

            # Try saving the data in case of failure
            elif response.status_code != 200:
                data = {
                    "source": source,
                    "url": self.url,
                    "attempt_no": retry_count,
                    "resp_code": response.status_code,
                    "status": ExtractionData.Status.FAILED,
                    "resp_data": None,
                    "resp_data_type": "text",
                    "file_extension": None,
                    "source_validation_status": ExtractionData.ValidationStatus.NO_VALIDATION,
                    "content_validation": "",
                    "resp_text": response.text,
                }

                # instance_obj is only set when ext_object_id was passed in
                if instance_obj is not None:
                    for key, value in data.items():
                        setattr(instance_obj, key, value)
                    instance_obj.save()

                logger.error(f"Request failed with status {response.status_code}")
                raise Exception("Request failed")

            resp_status = ExtractionData.Status.SUCCESS

            return {
                "source": source,
                "url": self.url,
                "attempt_no": retry_count,
                "resp_code": response.status_code,
                "status": resp_status,
                "resp_data": response,
                "resp_data_type": resp_type,
                "file_extension": file_extension,
                "source_validation_status": source_validation_status,
                "content_validation": "",
                "resp_text": "",
            }
        except requests.exceptions.RequestException as e:
            logger.error(f"Extraction failed for source {source}: {str(e)}")
            raise Exception(f"Request failed: {e}")
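`_get_file_extension` does an exact match on the Content-Type value, so real-world headers that carry parameters (e.g. `text/html; charset=utf-8`) fall through to the `txt` default — worth keeping in mind when reading stored `file_extension` values. A standalone sketch of the lookup (a hypothetical free function mirroring the method):

```python
def get_file_extension(content_type):
    # Mirrors Extraction._get_file_extension: exact Content-Type lookup,
    # with "txt" as the fallback for anything unrecognised.
    mappings = {
        "application/json": "json",
        "text/html": "html",
        "application/xml": "xml",
        "text/csv": "csv",
    }
    return mappings.get(content_type, "txt")


print(get_file_extension("application/json"))          # json
print(get_file_extension("text/html; charset=utf-8"))  # txt — parameters defeat the exact match
print(get_file_extension("image/png"))                 # txt
```

Stripping parameters first (`content_type.split(";")[0].strip()`) would make the mapping more robust.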