
Commit 931bc71

Merge pull request #11 from datasciencecampus/8-punctuality-by-geography
8 punctuality by geography
2 parents 09eba6e + 9ca496d commit 931bc71

18 files changed: +510 −29 lines

README.md

+7 −2
@@ -1,5 +1,5 @@
 <!--- Badges start --->
-<img src="https://img.shields.io/badge/repo%20status-in%20development%20(caution)-red" alt="Repository status is still in development (caution required)"/>
+<img src="https://img.shields.io/badge/repo%20status-in%20development%20(caution)-red" alt="Repository status is still in development (caution required)"/> <a href="https://codecov.io/gh/datasciencecampus/bus-metrics-england" > <img src="https://codecov.io/gh/datasciencecampus/bus-metrics-england/branch/dev/graph/badge.svg?token=hnkFyxDgV7"/></a>
 <!--- Badges end --->

 <img src="https://github.com/datasciencecampus/awesome-campus/blob/master/ons_dsc_logo.png">
@@ -30,10 +30,15 @@ You will also require a `.env` file in the format:
 BODS_API_KEY="<api key for the BODS service>"
 ```

-Data ingest scripts are now available. All resources (including geography and timetable data) and a sample 1 minute cut of real time data can be obtained:
+Data ingest scripts are now available. All resources (including geography and timetable data) and a sample 1 minute cut of real time data can be obtained. Punctuality can be acquired for any of 5 geographies using the `--geography` attribute:

 ```shell
 python setup.py
+
+# to be merged into setup
+python src/bus_metrics/aggregation/build_schedules.py
+
+python run.py -g lsoa
 ```

 ### Pre-commit actions
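The shell steps documented above map onto a single class introduced in this commit. A minimal sketch (not part of the commit) of the programmatic equivalent of `python run.py -g lsoa`, assuming the stop-level punctuality CSV and geography lookup table have already been produced by the preceding steps:

```python
# Sketch only: programmatic equivalent of `python run.py -g lsoa`.
# Assumes setup.py and build_schedules.py have already written the
# stop-level punctuality CSV and the geography lookup table locally.
from src.bus_metrics.aggregation.punctuality_rate import AggregationTool

tool = AggregationTool(geography="lsoa")  # any key under [boundaries] in ingest.toml
df = tool.punctuality_by_geography()  # also writes outputs/punctuality/lsoa_<timestamp>.csv
print(df.head())
```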

outputs/punctuality/.gitkeep

Whitespace-only changes.

requirements.txt

+3
@@ -10,6 +10,9 @@ pre-commit
 pyarrow==14.0
 pytest
 pytest-pythonpath
+pytest-mock
+pytest-randomly
 Rtree==1.1.0
 python-dotenv
 coverage
+gtfs-realtime-bindings>=0.0.7

run.py

+32
@@ -0,0 +1,32 @@
+"""Runs pipeline to reaggregate bus metrics by geography."""
+import argparse
+from src.bus_metrics.aggregation.punctuality_rate import AggregationTool
+
+
+def main():
+    """Reaggregate stop-level punctuality by selected geography.
+
+    Returns
+    -------
+    df: pandas.DataFrame
+        DataFrame of number of service stops and
+        punctuality rate by user-selected geography.
+
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-g", "--geography", nargs="?", help="which geography")
+    args = parser.parse_args()
+
+    if not args.geography:
+        geography = "lsoa"
+    else:
+        geography = args.geography
+
+    aTool = AggregationTool(geography=geography)
+    df = aTool.punctuality_by_geography()
+
+    return df
+
+
+if __name__ == "__main__":
+    main()
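`run.py` accepts any string for `-g/--geography` and only falls back to `lsoa` when the flag is omitted; an unrecognised value would surface later as a `KeyError` inside `AggregationTool`. One possible tightening, sketched here as an illustration rather than a change in this commit, is to derive the valid choices from the same `ingest.toml` the tool reads:

```python
# Illustrative sketch: constrain --geography to the [boundaries] keys in
# ingest.toml so bad values fail at parse time instead of deep in the tool.
import argparse

import toml

config = toml.load("src/bus_metrics/setup/ingest.toml")
valid_geographies = sorted(config["boundaries"])

parser = argparse.ArgumentParser()
parser.add_argument(
    "-g",
    "--geography",
    choices=valid_geographies,
    default="lsoa",
    help=f"geography to aggregate by: {', '.join(valid_geographies)}",
)
args = parser.parse_args()
```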

setup.py

+26 −1
@@ -1,6 +1,9 @@
 """Initial pipeline to obtain project resources/data."""
 from src.bus_metrics.setup.ingest_static_data import StaticDataIngest
 from src.bus_metrics.setup.ingest_realtime_data import RealtimeDataIngest
+from src.bus_metrics.setup.build_lookup import create
+
+# from src.bus_metrics.aggregation.build_schedules import Schedule_Builder
 from datetime import datetime
 import logging
 import os
@@ -36,7 +39,7 @@ def data_folder(logger: logging.Logger) -> None:


 if __name__ == "__main__":  # noqa: C901
-    session_name = f"ingest_{format(scriptStartTime, '%Y_%m_%d_%H:%M')}"
+    session_name = f"ingest_{format(datetime.now(), '%Y_%m_%d_%H:%M')}"
     logger = logging.getLogger(__name__)
     log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
     logging.basicConfig(
@@ -70,6 +73,10 @@ def data_folder(logger: logging.Logger) -> None:
         pass

     if ingest_toml["download_realtime_sample"]:
+
+        scriptStartTime = datetime.now()
+        scriptStartTimeUnix = time.mktime(scriptStartTime.timetuple())
+
         # TODO: more articulate ways of triggering every 10 seconds
         while (
             time.mktime(datetime.now().timetuple()) < scriptStartTimeUnix + 60
@@ -88,6 +95,24 @@ def data_folder(logger: logging.Logger) -> None:
         logger.warning("##Amend toml to download sample##")
         logger.warning("##Use shell script to download heavy##")

+    try:
+        logger.info("Building stops-geography lookup table")
+        # TODO: naming/access of function to be improved
+        create()
+    except Exception as e:
+        logger.warning(f"Build error: {e}")
+        pass
+
+    # build_schedules.py executed separately for now
+    # try:
+    #     schedule_builder = Schedule_Builder()
+    #     logger.info("Preparing punctuality data by stop")
+    #     # TODO: naming/access of method to be improved
+    #     schedule_builder.run()
+    # except Exception as e:
+    #     logger.warning(f"Build error: {e}")
+    #     pass
+
     logger.info("-----------------------------")
     logger.info("-------SETUP COMPLETED-------")
     logger.info("-----------------------------")

src/bus_metrics/aggregation/preprocessing.py

+6 −2
@@ -258,13 +258,17 @@ def convert_unix_to_time_string(
     return df


-def build_stops(output: str = "polars") -> pl.DataFrame | pd.DataFrame:
+def build_stops(
+    output: str = "polars", stops_data: str = "data/resources/gb_stops.csv"
+) -> pl.DataFrame | pd.DataFrame:
     """Read in gb_stops file and outouts as DataFrame.

     Parameters
     ----------
     output : str
         Output type, polars or pandas DataFrame. (Defaults "polars").
+    stops_data : str
+        Filepath to raw NAPTAN stops data locally.

     Returns
     -------
@@ -274,7 +278,7 @@ def build_stops(output: str = "polars") -> pl.DataFrame | pd.DataFrame:
     """
     # import NapTAN data
     stops = pl.read_csv(
-        "data/resources/gb_stops.csv",
+        stops_data,
         ignore_errors=True,
         dtypes={"stop_id": pl.Utf8},  # noqa: E501
     )
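Parameterising the stops path means `build_stops` can be pointed at a small fixture in tests instead of the full NAPTAN extract. A short usage sketch; the fixture path is hypothetical and `output="pandas"` follows the behaviour the docstring describes:

```python
# Sketch only: the fixture path below is hypothetical.
from src.bus_metrics.aggregation.preprocessing import build_stops

stops_pl = build_stops()  # polars DataFrame from data/resources/gb_stops.csv
stops_pd = build_stops(
    output="pandas", stops_data="tests/data/gb_stops_sample.csv"
)
```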

src/bus_metrics/aggregation/punctuality_rate.py

+133
@@ -0,0 +1,133 @@
+"""Class of tools required to reaggregate bus metrics by geographies."""
+import pandas as pd
+import toml
+from datetime import datetime
+
+
+class AggregationTool:
+    """Aggregate bus metrics by geographies.
+
+    Parameters
+    ----------
+    stop_level_punctuality: str
+        Full filepath to stop-level punctuality aggregated data
+    config: dict
+        Dictionary of imported toml ingest variables
+    geography: str
+        Geography by which data is to be aggregated
+
+    Attributes
+    ----------
+    code: str
+        Geography code in boundaries data e.g. LSOA21CD
+    name: str
+        Geography name in boundaries data e.g. LSOA21NM
+
+    Methods
+    -------
+    punctuality_by_geography
+        Combine stop-level punctuality data with boundaries
+        and reaggregate
+
+    """
+
+    def __init__(
+        self,
+        config: dict = toml.load("src/bus_metrics/setup/ingest.toml"),
+        geography_lookup_table: str = "data/resources/geography_lookup_table.csv",  # noqa: E501
+        geography: str = "lsoa",
+        outdir: str = "outputs/punctuality",
+    ) -> None:
+
+        self.region = config["region_to_analyse"]
+        self.date = datetime.now().strftime("%Y%m%d")
+        self.stop_level_punctuality: str = f"data/stop_level_punctuality/punctuality_by_stop_{self.region}_{self.date}.csv"  # noqa: E501
+        self.geography_lookup_table = geography_lookup_table
+        self.geography = geography
+        self.config = config
+        self.code = self.config["boundaries"][self.geography]["code"]
+        self.name = self.config["boundaries"][self.geography]["name"]
+        self.outdir = outdir
+
+    def merge_geographies_with_stop_punctuality(
+        self,
+    ) -> pd.DataFrame | Exception:
+        """Merge geography labels and stop-level punctuality.
+
+        Returns
+        -------
+        df: pandas.DataFrame
+            Dataframe of stop-level punctuality with all
+            associated geography labels.
+
+        Raises
+        ------
+        FileNotFoundError
+            When either stops or geography lookup
+            do not exist locally.
+
+        """
+        try:
+            stops = pd.read_csv(self.stop_level_punctuality, index_col=0)
+            lookup = pd.read_csv(self.geography_lookup_table, index_col=0)
+            df = pd.merge(
+                stops,
+                lookup,
+                on=["stop_id", "stop_lat", "stop_lon"],
+                how="left",
+            )
+            return df
+
+        except FileNotFoundError as e:
+            print(e, "Please re-run the build_lookup.py script.")
+            raise
+
+    def _reaggregate_punctuality(
+        self, labelled: pd.DataFrame = None
+    ) -> pd.DataFrame:
+        """Re-aggregate stop-level punctuality by specified geography.
+
+        Parameters
+        ----------
+        labelled: pandas.DataFrame
+            Dataframe of stop-level punctuality with all
+            available geography labels associated with each.
+
+        Returns
+        -------
+        df: pandas.DataFrame
+            Dataframe of number of service stops
+            and punctuality rate aggregated by geography.
+
+        """
+        code = self.config["boundaries"][self.geography]["code"]
+        name = self.config["boundaries"][self.geography]["name"]
+
+        labelled["punctual_service_stops"] = (
+            labelled["service_stops"] * labelled["punctuality_rate"]
+        ).astype(int)
+        df = labelled.groupby([code, name]).agg(
+            {"service_stops": "sum", "punctual_service_stops": "sum"}
+        )
+        df["punctuality_rate"] = (
+            df["punctual_service_stops"] / df["service_stops"]
+        )
+        df = df.reset_index()
+
+        return df
+
+    def punctuality_by_geography(self):
+        """Collect punctuality data, reaggregate and store locally.
+
+        Returns
+        -------
+        df: pandas.DataFrame
+            Dataframe of number of service stops
+            and punctuality rate aggregated by geography.
+
+        """
+        date_time = datetime.now().strftime("%Y%m%d-%H%M%S")
+        df = self.merge_geographies_with_stop_punctuality()
+        df = self._reaggregate_punctuality(df)
+        df.to_csv(f"{self.outdir}/{self.geography}_{date_time}.csv")
+        return df
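`_reaggregate_punctuality` recovers punctual stop counts from each stop's rate, sums both counts within a geography, and divides again, so the result is a service-stop-weighted rate rather than a simple mean of stop-level rates. A toy check of that arithmetic (synthetic numbers; the LSOA column names just follow the example in the class docstring):

```python
# Toy illustration of the weighted aggregation used in _reaggregate_punctuality.
import pandas as pd

labelled = pd.DataFrame(
    {
        "LSOA21CD": ["E01000001", "E01000001"],
        "LSOA21NM": ["Area A", "Area A"],
        "service_stops": [100, 10],
        "punctuality_rate": [0.90, 0.50],
    }
)

labelled["punctual_service_stops"] = (
    labelled["service_stops"] * labelled["punctuality_rate"]
).astype(int)
agg = labelled.groupby(["LSOA21CD", "LSOA21NM"]).agg(
    {"service_stops": "sum", "punctual_service_stops": "sum"}
)
agg["punctuality_rate"] = agg["punctual_service_stops"] / agg["service_stops"]

# (90 + 5) / 110 ≈ 0.864, not the unweighted mean of 0.70
print(agg.reset_index())
```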

src/bus_metrics/setup/build_lookup.py

+96
@@ -0,0 +1,96 @@
+"""Tool to generate lookup table for stops to various geography levels."""
+
+import toml
+import geopandas as gpd
+import pandas as pd
+from src.bus_metrics.setup.ingest_static_data import StaticDataIngest
+
+
+def _build_lookup_tool(
+    stops: pd.DataFrame,
+    bounds: gpd.GeoDataFrame,
+    bounds_code: str,
+    bounds_name: str,
+) -> pd.DataFrame:
+    """Allocate geography labels to bus stops by means of a spatial join.
+
+    Parameters
+    ----------
+    stops: pandas.DataFrame
+        Dataframe of NAPTAN stops data.
+    bounds: geopandas.GeoDataFrame
+        Dataframe of geography boundaries data.
+    bounds_code: str
+        Geography code e.g. LSOA21CD
+    bounds_name: str
+        Geography name e.g. LSOA21NM
+
+    Returns
+    -------
+    df: pandas.DataFrame
+        Dataframe of stops-geography lookup table.
+
+    """
+    stops["geometry"] = gpd.points_from_xy(
+        stops["stop_lon"], stops["stop_lat"]
+    )
+    stops = gpd.GeoDataFrame(stops)
+    stops = stops.set_crs("4326")
+    df = stops.sjoin(bounds, how="left", predicate="within")
+
+    cols = list(stops.columns)
+    cols.extend((bounds_code, bounds_name))
+    df = df[cols]
+
+    return df
+
+
+def create() -> pd.DataFrame:
+    """Download and process boundaries data. Label stops.
+
+    Returns
+    -------
+    stops: pandas.DataFrame
+        Dataframe of stops-geography lookup table.
+
+    """
+    installer = StaticDataIngest()
+    config = toml.load("src/bus_metrics/setup/ingest.toml")
+    boundaries = config["boundaries"]
+
+    # TODO: address mixed dtypes pandas DtypeWarning
+    stops = pd.read_csv("data/resources/gb_stops.csv", index_col=0)
+    stops = stops[stops["Status"] == "active"]
+    stops = stops[["ATCOCode", "Latitude", "Longitude"]]
+    stops.columns = ["stop_id", "stop_lat", "stop_lon"]
+
+    # TODO: consider tqdm progress bar for large file downloads
+    for geog in boundaries:
+        url = boundaries[geog]["url"]
+        filename = boundaries[geog]["filename"]
+        code = boundaries[geog]["code"]
+        name = boundaries[geog]["name"]
+
+        try:
+            print(
+                f"Downloading/Processing FULL RES(!) {geog} boundary data..."
+            )
+            installer.ingest_data_from_geoportal(url, filename)
+
+        except FileExistsError:
+            pass
+
+        bounds = gpd.read_file(filename)
+        stops = _build_lookup_tool(stops, bounds, code, name)
+        stops = stops.drop(columns="geometry")
+
+    print("Storing lookup file...")
+    # note: currently retains all stops in NAPTAN data
+    # irrespective of location across UK
+    stops.to_csv("data/resources/geography_lookup_table.csv")
+
+    return stops
+
+
+if __name__ == "__main__":
+    create()
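`_build_lookup_tool` relies on a left `within` spatial join from stop points to boundary polygons, so any stop outside every polygon keeps NaN geography labels. A toy end-to-end check of that join (synthetic coordinates and a single square polygon; the LSOA column names are illustrative):

```python
# Toy illustration of the point-in-polygon join used by _build_lookup_tool.
import geopandas as gpd
import pandas as pd
from shapely.geometry import Polygon

stops = pd.DataFrame(
    {"stop_id": ["A", "B"], "stop_lat": [0.5, 5.0], "stop_lon": [0.5, 5.0]}
)
stops["geometry"] = gpd.points_from_xy(stops["stop_lon"], stops["stop_lat"])
stops = gpd.GeoDataFrame(stops, geometry="geometry", crs="EPSG:4326")

bounds = gpd.GeoDataFrame(
    {"LSOA21CD": ["E01000001"], "LSOA21NM": ["Area A"]},
    geometry=[Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])],
    crs="EPSG:4326",
)

# stop A falls inside the square; stop B keeps NaN labels from the left join
joined = stops.sjoin(bounds, how="left", predicate="within")
print(joined[["stop_id", "LSOA21CD", "LSOA21NM"]])
```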
