Added Facilities Import (#2)
* Added facilities pull skeleton

* Added remaining synchronization logic and scheduler

* Added README

* Added readme and sigeca client

* Added API

* Removed codespace

* Synchronization of requirements added
dborowiecki authored Jun 19, 2024
1 parent 10c7068 commit a0ecfad
Showing 33 changed files with 2,349 additions and 11 deletions.
33 changes: 31 additions & 2 deletions .github/workflows/data_sync_microservice-docker-image.yml
@@ -7,7 +7,7 @@ on:
      - 'release/**'

jobs:
  build:
  build-export:
    runs-on: ubuntu-latest

    steps:
@@ -31,7 +31,7 @@ jobs:
          push: true
          tags: |
            ghcr.io/openlmis-angola/open-lmis-sigeca-data-export-microservice:latest
            ghcr.io/openlmis-angola/open-lmis-sigeca_data_export-microservice:${{ github.ref_name }}
            ghcr.io/openlmis-angola/open-lmis-sigeca-data-export-microservice:${{ github.ref_name }}
      - name: Log out of GitHub Container Registry
        run: docker logout ghcr.io

  build-import:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v1
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v2
        with:
          context: ./sigeca_data_import_microservice # Adjust the context to the subdirectory
          push: true
          tags: |
            ghcr.io/openlmis-angola/open-lmis-sigeca-data-import-microservice:latest
            ghcr.io/openlmis-angola/open-lmis-sigeca-data-import-microservice:${{ github.ref_name }}
      - name: Log out of GitHub Container Registry
        run: docker logout ghcr.io
5 changes: 4 additions & 1 deletion .gitignore
@@ -161,4 +161,7 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

sigeca_data_export_microservice/config.json
sigeca_data_export_microservice/config.json
sigeca_data_import_microservice/config.json
sigeca_data_import_microservice/mocked_facilities.json
sigeca_data_import_microservice/private_key
@@ -1,5 +1,5 @@
from typing import List
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.schedulers.blocking import BlockingScheduler
from app.application.services.sigeca_data_export_service import DataSyncService
from app.infrastructure.api import ResourceAPIClient
from app.infrastructure.utils import ChangeLogOperationEnum
@@ -22,7 +22,7 @@ def __init__(
        synchronized_operations: List[ChangeLogOperationEnum] = None,
    ):
        self.sync_service = sync_service
        self.scheduler = BackgroundScheduler()
        self.scheduler = BlockingScheduler()
        self.session_maker = session_maker
        self.sync_interval_minutes = sync_interval_minutes
        self.synchronized_resources = synchronized_resources
4 changes: 1 addition & 3 deletions sigeca_data_export_microservice/docker-compose.yml
@@ -4,12 +4,10 @@ version: '3.8'
services:
  app:
    build: .
    container_name: spark_app
    container_name: export_service
    volumes:
      - .:/app
      - logs:/app/logs
    ports:
      - "4040:4040" # Spark UI
    command: ["python", "main.py", "--run-mode", "continuous"]
    network_mode: "host"

3 changes: 0 additions & 3 deletions sigeca_data_export_microservice/main.py
@@ -31,9 +31,6 @@ def _run_scheduler(session_maker, jdbc_reader, sigeca_data_export_service, sync_
        )

        scheduler.start()
        # Keep the script running
        while True:
            pass
    except (KeyboardInterrupt, SystemExit):
        scheduler.stop()
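The busy-wait loop removed here is redundant once the service runs on a `BlockingScheduler`: its `start()` call blocks the calling thread until the scheduler shuts down. A minimal standalone sketch of the resulting pattern, with a placeholder job (APScheduler 3.x module path; the real sync call is elided):

```python
from apscheduler.schedulers.blocking import BlockingScheduler

def sync_job():
    # Placeholder for the real synchronization call
    print("running sync...")

scheduler = BlockingScheduler()
scheduler.add_job(sync_job, "interval", minutes=5)

try:
    scheduler.start()  # blocks here; no `while True: pass` needed
except (KeyboardInterrupt, SystemExit):
    scheduler.shutdown()
```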

68 changes: 68 additions & 0 deletions sigeca_data_import_microservice/Dockerfile
@@ -0,0 +1,68 @@
# Use the official Python image from Docker Hub
FROM python:3.10-slim

# Set environment variables to avoid interactive prompts during package installation
ENV DEBIAN_FRONTEND=noninteractive

# Update and install dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        apt-transport-https \
        ca-certificates \
        curl \
        wget \
        gnupg \
        libc-dev \
        gcc \
        software-properties-common \
        libpq-dev \
        default-jdk \
        default-jre \
        file && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*


# Install Python dependencies
COPY requirements.txt .

RUN pip install -r requirements.txt

# Set environment variables
ENV PYTHONPATH=/app

# Copy application code
COPY . /app

# Spark installation (versions kept in sync with the download below)
ARG spark_version="3.4.3"
ARG hadoop_version="3"
ARG openjdk_version="11"

ENV APACHE_SPARK_VERSION="${spark_version}" \
    HADOOP_VERSION="${hadoop_version}"

WORKDIR /tmp
# Download Spark from the Apache CDN
RUN wget -q "https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"
RUN tar xzf "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" -C /usr/local --owner root --group root --no-same-owner && \
    rm "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"

WORKDIR /usr/local

# Configure Spark
ENV SPARK_HOME=/usr/local/spark
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
    PATH=$PATH:$SPARK_HOME/bin

RUN ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" spark

WORKDIR /app

# Expose any necessary ports (e.g., for Spark UI)
EXPOSE 4040

# Define default command
CMD ["python", "main.py", "--run-mode", "continuous"]
151 changes: 151 additions & 0 deletions sigeca_data_import_microservice/README.md
@@ -0,0 +1,151 @@
# Sigeca Data Import Microservice

This microservice synchronizes data between a local database and an external API using Apache Spark. It supports both continuous synchronization and one-time integration.

## Table of Contents

- [Prerequisites](#prerequisites)
- [Setup](#setup)
- [Configuration](#configuration)
- [Building the Docker Image](#building-the-docker-image)
- [Running the Application](#running-the-application)
- [Troubleshooting](#troubleshooting)
- [Logs](#logs)
- [Acknowledgements](#acknowledgements)


## Prerequisites
### Docker
- Docker and Docker Compose installed on your system
- An external database (e.g., PostgreSQL) accessible from your Docker network

### Local run

- Python 3.10
- Java Runtime Environment (JRE) installed
- Apache Hadoop and Apache Spark
- An external database (e.g., PostgreSQL) accessible from your machine

## Setup

1. Clone the repository:

```bash
git clone https://github.com/OpenLMIS-Angola/sigeca-synchronization.git
cd sigeca-synchronization/sigeca_data_import_microservice
```
2. Create and activate a virtual environment:

```bash
python3 -m venv venv
source venv/bin/activate
```
3. Install the requirements:

```bash
pip install -r requirements.txt
```

## Configuration

Create a `config.json` file with your specific settings, based on the provided `config_example.json`:

```json5
{
    "open_lmis_api": { // Used for sending new entries to the OpenLMIS database
        "api_url": "https://openlmisapi.example.org/api/", // URL of the API endpoint
        "username": "lmis_user", // Authorized user
        "password": "password", // Authorized user password
        "login_token": "dSFdoi1fb4l6bn16YxhgbxhlbdU=" // Basic auth token value, taken from a client request to the server
    },
    "sigeca_api": {
        "api_url": "http://exmapleapisiggeca.org/api", // Endpoint used as the source of truth for facilities
        "headers": { // Headers used for the synchronization requests
            "ContentType": "application/json"
        },
        "credentials": { // Credentials used for user authorization
            "username": "username",
            "password": "password"
        },
        "skip_verification": false // Skip SSL certificate validation, FOR TESTING ONLY
    },
    "database": { // DB connection used for validating existing facilities via the ORM
        "username": "db_user",
        "password": "db_passwd",
        "host": "localhost",
        "port": 5432,
        "database": "open_lmis"
    },
    "jdbc_reader": { // PySpark connection details for data validation
        "jdbc_url": "jdbc:postgresql://dbserver.example.org:5432/open_lmis", // Points to the db behind open_lmis_api
        "jdbc_user": "db_user", // DB user
        "jdbc_password": "db_passwd", // DB password
        "jdbc_driver": "org.postgresql.Driver", // Default driver
        "log_level": "WARN", // Log level for Spark operations
        "ssh_host": "sshOptionalHost", // SSH server used when tunneling is required to reach the db
        "ssh_port": 22, // Port for the SSH connection
        "ssh_user": "ubuntu", // SSH user
        "ssh_private_key_path": "./private_key", // Relative path to the RSA private key
        "remote_bind_address": "dbserver.example.org", // DB address as reachable from the SSH server
        "remote_bind_port": 5432, // DB port on the remote host
        "local_bind_port": 5559 // Local port bound for the tunnel
    },
    "sync": {
        "interval_minutes": 5 // Job interval in minutes
    }
}
```
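A minimal sketch of how this configuration could be consumed, including the optional SSH tunnel for the JDBC connection. This assumes the real `config.json` contains no `//` comments (the annotations above are documentation) and uses the `sshtunnel` package; the fallback logic and variable names are illustrative, not the actual implementation:

```python
import json

from sshtunnel import SSHTunnelForwarder  # assumed dependency for tunneling

with open("config.json") as f:
    config = json.load(f)

jdbc = config["jdbc_reader"]

# Open a tunnel only when an SSH host is configured (assumption).
if jdbc.get("ssh_host"):
    tunnel = SSHTunnelForwarder(
        (jdbc["ssh_host"], jdbc["ssh_port"]),
        ssh_username=jdbc["ssh_user"],
        ssh_pkey=jdbc["ssh_private_key_path"],
        remote_bind_address=(jdbc["remote_bind_address"], jdbc["remote_bind_port"]),
        local_bind_address=("127.0.0.1", jdbc["local_bind_port"]),
    )
    tunnel.start()
    # Spark then connects through the local end of the tunnel.
    jdbc_url = f"jdbc:postgresql://127.0.0.1:{jdbc['local_bind_port']}/{config['database']['database']}"
else:
    jdbc_url = jdbc["jdbc_url"]

print(jdbc_url)
```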
## Running the Application
### Continuous Synchronization
To run the application continuously using Docker Compose:
```bash
docker-compose run app python main.py --run-mode continuous
```
### One-time Integration
To perform a one-time integration:
1. Run the application with the `one-time` argument:
```bash
docker-compose run app python main.py --run-mode one-time
```
This runs a one-time task that synchronizes all available data with the external system; a sketch of the mode dispatch follows.
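A minimal sketch of how `main.py` could dispatch on `--run-mode` (the flag and mode names come from the commands above; the stub job and scheduler wiring are illustrative assumptions, not the actual implementation):

```python
import argparse

from apscheduler.schedulers.blocking import BlockingScheduler

def synchronize_facilities():
    # Stand-in for the real facility synchronization service (assumption)
    print("synchronizing facilities...")

def main():
    parser = argparse.ArgumentParser(description="SIGECA facilities import")
    parser.add_argument(
        "--run-mode",
        choices=["continuous", "one-time"],
        default="continuous",
        help="continuous: scheduled sync loop; one-time: single full sync",
    )
    args = parser.parse_args()

    if args.run_mode == "one-time":
        synchronize_facilities()  # one pass over all data, then exit
    else:
        scheduler = BlockingScheduler()
        scheduler.add_job(synchronize_facilities, "interval", minutes=5)
        scheduler.start()  # blocks until interrupted

if __name__ == "__main__":
    main()
```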
## Troubleshooting
### Common Issues
1. **Network Issues**:
- Ensure the Docker network is properly configured if the application runs in a docker-compose stack separate from OpenLMIS.
2. **Configuration Errors**:
- Double-check your `config.json` file for accuracy, especially the database connection details.
3. **Dependency Issues**:
- If you encounter issues with Java or Hadoop dependencies, ensure they are correctly installed and that the download URLs are valid.
### Logs
Logs are stored in the `logs` volume. You can access them for debugging and monitoring purposes:
```bash
docker-compose logs app
```
## Acknowledgements
- [Apache Spark](https://spark.apache.org/)
- [Apache Hadoop](http://hadoop.apache.org/)
- [Docker](https://www.docker.com/)
- [OpenLMIS-Angola](https://github.com/OpenLMIS-Angola)
For any questions or issues, please open an issue on the [GitHub repository](https://github.com/OpenLMIS-Angola/sigeca-synchronization/issues).
Empty file.
Empty file.
29 changes: 29 additions & 0 deletions sigeca_data_import_microservice/app/application/scheduler.py
@@ -0,0 +1,29 @@
import logging

from apscheduler.schedulers.blocking import BlockingScheduler

from .synchronization.facilities import FacilitySynchronizationService


class FacilitySyncScheduler:
    def __init__(
        self,
        sync_service: FacilitySynchronizationService,
        interval: int,
    ):
        self.sync_service = sync_service
        self.sync_interval_minutes = interval
        self.scheduler = BlockingScheduler()

    def start(self):
        self.scheduler.add_job(
            self.run_sync, "interval", minutes=self.sync_interval_minutes
        )
        self.scheduler.start()

    def stop(self):
        self.scheduler.shutdown()

    def run_sync(self):
        try:
            self.sync_service.synchronize_facilities()
        except Exception:
            # logging.exception records the traceback along with the message
            logging.exception("Synchronization job failed.")
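A usage sketch for this scheduler (the stub service below only mimics the `synchronize_facilities` interface; the real wiring constructs a `FacilitySynchronizationService` with its API clients and database session):

```python
# Illustrative stand-in for FacilitySynchronizationService (assumption).
class StubSyncService:
    def synchronize_facilities(self):
        print("running facility sync...")

scheduler = FacilitySyncScheduler(sync_service=StubSyncService(), interval=5)
try:
    scheduler.start()  # BlockingScheduler: blocks until interrupted
except (KeyboardInterrupt, SystemExit):
    scheduler.stop()
```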
Empty file.
