
Commit 5546ff6: Initial Commit
Committed Feb 11, 2025
1 parent: 0fc1ccd
6 files changed: +194, −0 lines

IRCTC Flowchart.png (binary file, 188 KB)

bigquery_create_table.sql (+15 lines)
-- Target table for the IRCTC streaming pipeline (written to by Dataflow).
CREATE TABLE `irctc_dwh.irctc_stream_tb` (
  row_key STRING,
  name STRING,
  age INT64,
  email STRING,
  join_date DATE,
  last_login TIMESTAMP,
  loyalty_points INT64,
  account_balance FLOAT64,
  is_active BOOL,
  inserted_at TIMESTAMP,
  updated_at TIMESTAMP,
  loyalty_status STRING,
  account_age_days INT64
);
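The DDL above can be applied from the bq CLI or from Python. A minimal sketch, assuming the google-cloud-bigquery package is installed, the irctc_dwh dataset already exists, and the .sql file sits next to the script (this helper is illustrative, not part of the commit):

# Illustrative only: apply bigquery_create_table.sql with the BigQuery client.
# Assumes the irctc_dwh dataset already exists in the project.
from google.cloud import bigquery

client = bigquery.Client(project="gds-project-432013")
with open("bigquery_create_table.sql") as f:
    ddl = f.read()
client.query(ddl).result()  # runs the CREATE TABLE statement and waits for it
print("Table irctc_dwh.irctc_stream_tb created.")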

config.py (+9 lines)
# config.py

PROJECT_ID = "gds-project-432013"              # Your GCP Project ID
TOPIC_ID = "irctc-data"                        # Pub/Sub topic for data ingestion
SUBSCRIPTION_ID = "irctc-data-sub"             # Pub/Sub subscription for streaming
BQ_DATASET = "irctc_dwh"                       # BigQuery dataset name
BQ_TABLE = "irctc_stream_tb"                   # BigQuery table name
TEMP_LOCATION = "gs://your-bucket/temp"        # GCS bucket for Dataflow temp files
STAGING_LOCATION = "gs://your-bucket/staging"  # GCS bucket for Dataflow staging files
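TEMP_LOCATION and STAGING_LOCATION still contain the gs://your-bucket placeholder, so a quick startup check can save a failed Dataflow launch. A small sketch; the check itself is an assumption, not part of the commit:

# Illustrative sanity check: fail fast if the GCS placeholders were not replaced.
import config

for name in ("TEMP_LOCATION", "STAGING_LOCATION"):
    if "your-bucket" in getattr(config, name):
        raise ValueError(f"{name} still points at the placeholder bucket; edit config.py")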

dataflow_pipeline.py (+39 lines)
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
import json
import config

class TransformData(beam.DoFn):
    def process(self, element):
        try:
            record = json.loads(element.decode("utf-8"))
            record['name'] = record.get('name', '').title()    # Capitalize name
            record['email'] = record.get('email', '').lower()  # Convert email to lowercase
            record['loyalty_status'] = 'Platinum' if record.get('loyalty_points', 0) > 500 else 'Standard'
            yield record
        except Exception as e:
            print(f"Error processing record: {e}")

def run():
    pipeline_options = PipelineOptions(
        streaming=True,
        project=config.PROJECT_ID,
        temp_location=config.TEMP_LOCATION,
        staging_location=config.STAGING_LOCATION
    )
    pipeline_options.view_as(StandardOptions).streaming = True

    with beam.Pipeline(options=pipeline_options) as pipeline:
        (
            pipeline
            | "Read from Pub/Sub" >> beam.io.ReadFromPubSub(subscription=f"projects/{config.PROJECT_ID}/subscriptions/{config.SUBSCRIPTION_ID}")
            | "Transform Data" >> beam.ParDo(TransformData())
            | "Write to BigQuery" >> beam.io.WriteToBigQuery(
                table=f"{config.PROJECT_ID}:{config.BQ_DATASET}.{config.BQ_TABLE}",
                schema="row_key:STRING, name:STRING, age:INTEGER, email:STRING, join_date:DATE, last_login:TIMESTAMP, loyalty_points:INTEGER, account_balance:FLOAT, is_active:BOOLEAN, inserted_at:TIMESTAMP, updated_at:TIMESTAMP, loyalty_status:STRING, account_age_days:INTEGER",
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
            )
        )

if __name__ == "__main__":
    run()
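As written, run() falls back to Beam's default DirectRunner because no runner is set. A hedged sketch of the extra options a managed Dataflow run would need; the region and job name are illustrative assumptions, not from the repo:

# Sketch only: options for running the same pipeline on the Dataflow service.
from apache_beam.options.pipeline_options import PipelineOptions
import config

dataflow_options = PipelineOptions(
    streaming=True,
    runner="DataflowRunner",           # managed Dataflow instead of local execution
    project=config.PROJECT_ID,
    region="us-central1",              # assumed region
    job_name="irctc-stream-pipeline",  # assumed job name
    temp_location=config.TEMP_LOCATION,
    staging_location=config.STAGING_LOCATION,
)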

irctc_mock_data_to_pubsub.py (+70 lines)
from google.cloud import pubsub_v1
import random
import string
import uuid
import json
from datetime import datetime, timedelta

# Configuration
project_id = "gds-project-432013"
topic_id = "irctc-data"

def initialize_pubsub():
    try:
        publisher = pubsub_v1.PublisherClient()
        topic_path = publisher.topic_path(project_id, topic_id)
        return publisher, topic_path
    except Exception as e:
        print(f"Failed to initialize Pub/Sub client: {e}")
        raise

# Generate mock data
def generate_mock_data(num_rows):
    try:
        data = []
        for _ in range(num_rows):
            row_key = str(uuid.uuid4())
            row_data = {
                "row_key": row_key,
                "name": ''.join(random.choices(string.ascii_letters, k=10)),
                "age": random.randint(18, 90),
                "email": ''.join(random.choices(string.ascii_lowercase, k=5)) + "@example.com",
                "join_date": (datetime.now() - timedelta(days=random.randint(0, 3650))).strftime('%Y-%m-%d'),
                "last_login": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                "loyalty_points": random.randint(0, 1000),
                "account_balance": round(random.uniform(100, 10000), 2),
                "is_active": random.choice([True, False]),
                "inserted_at": datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
                "updated_at": None
            }
            data.append(row_data)
        return data
    except Exception as e:
        print(f"Failed to generate mock data: {e}")
        raise

# Publish data to Pub/Sub
def publish_to_pubsub(publisher, topic_path, data):
    try:
        for record in data:
            message_json = json.dumps(record)
            message_bytes = message_json.encode('utf-8')
            future = publisher.publish(topic_path, data=message_bytes)
            print(f"Data -> {message_json}")
            print(f"Published message ID: {future.result()}")
        print(f"Published {len(data)} messages successfully.")
    except Exception as e:
        print(f"Failed to publish data: {e}")
        raise

# Main execution
if __name__ == "__main__":
    try:
        publisher, topic_path = initialize_pubsub()

        # Generate and publish mock data
        mock_data = generate_mock_data(20)
        publish_to_pubsub(publisher, topic_path, mock_data)
    except Exception as e:
        print(f"An error occurred during the execution: {e}")

transform_udf.py (+61 lines)
import json
from datetime import datetime

def transform_data(element):
    try:
        # Parse the JSON message
        record = json.loads(element.replace("'", "\""))

        # Data Cleaning and Validation
        record['row_key'] = record.get('row_key', '')
        record['name'] = record.get('name', '').title()  # Capitalize the name
        record['email'] = record.get('email', '').lower()  # Ensure email is lowercase
        record['is_active'] = bool(record.get('is_active', False))  # Ensure is_active is a boolean

        # Enriching Data
        record['loyalty_status'] = 'Platinum' if record.get('loyalty_points', 0) > 500 else 'Standard'

        # Convert inserted_at and updated_at to ISO format, handle missing or invalid timestamps
        inserted_at = record.get('inserted_at')
        updated_at = record.get('updated_at')

        if inserted_at:
            try:
                record['inserted_at'] = datetime.strptime(inserted_at, '%Y-%m-%d %H:%M:%S').isoformat()
            except ValueError:
                record['inserted_at'] = datetime.utcnow().isoformat()
        else:
            record['inserted_at'] = datetime.utcnow().isoformat()

        if updated_at:
            try:
                record['updated_at'] = datetime.strptime(updated_at, '%Y-%m-%d %H:%M:%S').isoformat()
            except ValueError:
                record['updated_at'] = '1970-01-01T00:00:00'  # Set to Unix epoch if parsing fails
        else:
            record['updated_at'] = '1970-01-01T00:00:00'  # Set to Unix epoch if not provided

        # Calculate account age in days (assumes join_date is in YYYY-MM-DD format)
        join_date = record.get('join_date')
        if join_date:
            try:
                join_date_obj = datetime.strptime(join_date, '%Y-%m-%d')
                record['account_age_days'] = (datetime.utcnow() - join_date_obj).days
            except ValueError:
                record['account_age_days'] = 0
        else:
            record['account_age_days'] = 0  # Default to 0 if join_date is missing

        # Handling missing or invalid values with defaults
        record['age'] = record.get('age', 0)
        record['account_balance'] = record.get('account_balance', 0.0)
        record['loyalty_points'] = record.get('loyalty_points', 0)
        record['last_login'] = record.get('last_login', '1970-01-01T00:00:00')  # Default to epoch if missing

        # Return JSON string with double quotes
        return json.dumps(record)

    except Exception as e:
        print(f"Error processing record: {e}")
        return None  # Handle errors appropriately
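A quick local check of transform_data; the sample record below is illustrative and simply mirrors the fields the mock generator produces:

# Illustrative usage: run the UDF on one sample record and print the result.
sample = json.dumps({
    "row_key": "abc-123",
    "name": "john doe",
    "email": "John.Doe@Example.COM",
    "is_active": 1,
    "loyalty_points": 750,
    "join_date": "2020-01-15",
    "inserted_at": "2024-06-01 10:00:00",
    "updated_at": None,
})
print(transform_data(sample))  # name title-cased, email lowercased, loyalty_status "Platinum"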
