From 338a06e583a1d7e71cdc4159f555f3276e987770 Mon Sep 17 00:00:00 2001 From: "Azwan b. Amit" Date: Wed, 19 Nov 2025 11:14:52 +0800 Subject: [PATCH] update data flow diagram --- docs/data-platform-manual.md | 52 +++++++++++++++--------------------- 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/docs/data-platform-manual.md b/docs/data-platform-manual.md index 20c2b20..cc30352 100644 --- a/docs/data-platform-manual.md +++ b/docs/data-platform-manual.md @@ -46,41 +46,33 @@ An **S3-compatible storage provider** (e.g., MinIO) used to store and retrieve u ## Workflow ```mermaid -flowchart TD +flowchart TB -%% STAGE 1: DATA SOURCES -A["Data Sources -(S3 / MinIO, DBs, APIs)"] -->|Ingestion Jobs| B[Apache Airflow] +subgraph src ["Data source"] +direction LR +ext_api[/"API
(HTTP, REST, Graph)"/] +ext_s3@{ shape: cyl, label: "Object Storage
(S3, MinIO, GCS)" } +ext_db@{ shape: cyl, label: "Database
(MySQL, PostgreSQL)" } +ext_fs@{ shape: cyl, label: "Filesystem
(HDFS, NAS)" } +end -%% STAGE 2: RAW STORAGE -B -->|Store Raw Data| C["Raw Zone -(S3 / MinIO)"] +subgraph emgr ["Data Platform"] +dag@{ shape: docs, label: "Python DAG" } +af["Airflow"] +tr["Trino"] +ss("Superset") +end -%% STAGE 3: TRANSFORMATION -C -->|DAG / ETL / SQL Queries| D["Trino -(Query Engine)"] -B -->|Workflow Orchestration| D +s3@{ shape: cyl, label: "S3
(MinIO)" } -%% STAGE 4: PROCESSED STORAGE -D -->|Write Processed Data| E["Processed / Curated Zone -(S3 / MinIO)"] +dag -- (1a)
Fetch
raw data
(API, SDK) --> src +dag -- (1b) --> tr +tr -- (1b)
Fetch
raw data
(Trino connector) --> src +af -- (2)
Execute
script --> dag +dag -- (3)
Store
processed
data
(SQL) --> tr +s3 <-- (4)
Read/write data
(Hive / Iceberg format) --> tr +ss -- (5)
Query
processed
data
(SQL) --> tr -%% STAGE 5: QUERY LAYER -E -->|Query Interface| F["Trino -(SQL Access Layer)"] - -%% STAGE 6: VISUALIZATION -F -->|Data Access| G["Apache Superset -(Dashboarding & Analytics)"] - -%% LABELS -classDef core fill:#4a90e2,stroke:#2c3e50,stroke-width:1px,color:white; -classDef storage fill:#6dbf4b,stroke:#2c3e50,stroke-width:1px,color:white; -classDef optional fill:#aaaaaa,stroke:#333,stroke-width:0.5px,color:white; - -class B,D,F,G core; -class C,E storage; -class H1,H2,H3,H4 optional; ``` ## Data Pipeline