Merge pull request #2 from SMD-Bioinformatics-Lund/35-generate-master-html

ryanjameskennedy · web-flow · commit eba0557be4f1 · 2025-01-23T10:43:07.000+01:00
Add py script that generates master html
diff --git a/assets/master_template.html b/assets/master_template.html
@@ -0,0 +1,72 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>16S Samples Report</title>
+    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
+</head>
+<body>
+    <div class="container my-5">
+        <div class="card">
+            <div class="card-header text-white bg-primary">
+                <h2 class="card-title mb-0">Sample Report</h2>
+            </div>
+            <div class="card-body">
+                <div class="table-responsive">
+                    <table class="table table-bordered table-striped table-hover">
+                        <thead class="table-success">
+                            <tr>
+                                <th rowspan="2">Sample ID</th>
+                                <th colspan="1" class="text-center">Results</th>
+                                <th colspan="1" class="text-center">QC</th>
+                                <th colspan="8" class="text-center">NanoPlot</th>
+                                <th colspan="3" class="text-center">Pipeline</th>
+                            </tr>
+                            <tr>
+                                <th class="text-center">Krona</th>
+                                <th class="text-center">MultiQC Report</th>
+                                <th class="text-center">Report</th>
+                                <th class="text-center">Length vs Quality Scatter (Dot)</th>
+                                <th class="text-center">Length vs Quality Scatter (KDE)</th>
+                                <th class="text-center">Non-weighted Histogram</th>
+                                <th class="text-center">Non-weighted Log-transformed Histogram</th>
+                                <th class="text-center">Weighted Histogram</th>
+                                <th class="text-center">Weighted Log-transformed Histogram</th>
+                                <th class="text-center">Yield by Length</th>
+                                <th class="text-center">Execution Report</th>
+                                <th class="text-center">Execution Timeline</th>
+                                <th class="text-center">DAG</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                            {# One row per sample. The cell order must mirror the second header row: Krona, MultiQC, then the eight NanoPlot columns starting with the NanoPlot report, then the three pipeline_info links. #}
+                            {% for sample_id in sample_ids %}
+                            <tr>
+                                <td>{{ sample_id }}</td>
+                                <td><a href="./krona/{{ sample_id }}_T1_krona.html">Krona</a></td>
+                                <td><a href="./multiqc/multiqc_report.html">MultiQC</a></td>
+                                <td><a href="./nanoplot/{{ sample_id }}_T1_nanoplot_unprocessedNanoPlot-report.html">NanoPlot Report</a></td>
+                                <td><a href="./nanoplot/{{ sample_id }}_T1_nanoplot_unprocessedLengthvsQualityScatterPlot_dot.html">Dot Scatter Plot</a></td>
+                                <td><a href="./nanoplot/{{ sample_id }}_T1_nanoplot_unprocessedLengthvsQualityScatterPlot_kde.html">KDE Scatter Plot</a></td>
+                                <td><a href="./nanoplot/{{ sample_id }}_T1_nanoplot_unprocessedNon_weightedHistogramReadlength.html">Non-weighted Histogram</a></td>
+                                <td><a href="./nanoplot/{{ sample_id }}_T1_nanoplot_unprocessedNon_weightedLogTransformed_HistogramReadlength.html">Non-weighted Log-transformed Histogram</a></td>
+                                <td><a href="./nanoplot/{{ sample_id }}_T1_nanoplot_unprocessedWeightedHistogramReadlength.html">Weighted Histogram</a></td>
+                                <td><a href="./nanoplot/{{ sample_id }}_T1_nanoplot_unprocessedWeightedLogTransformed_HistogramReadlength.html">Weighted Log-transformed Histogram</a></td>
+                                <td><a href="./nanoplot/{{ sample_id }}_T1_nanoplot_unprocessedYield_By_Length.html">Yield by Length</a></td>
+                                <td><a href="./pipeline_info/execution_report_{{ date_id }}.html">Execution Report</a></td>
+                                <td><a href="./pipeline_info/execution_timeline_{{ date_id }}.html">Execution Timeline</a></td>
+                                <td><a href="./pipeline_info/pipeline_dag_{{ date_id }}.html">Pipeline DAG</a></td>
+                            </tr>
+                            {% endfor %}
+                        </tbody>
+                    </table>
+                </div>
+            </div>
+            <div class="card-footer text-muted">
+                Sequenced on {{ seqrun_date }}
+            </div>
+        </div>
+    </div>
+    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
+</body>
+</html>
diff --git a/bin/generate_master_html.py b/bin/generate_master_html.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+
+"""Generate a master html template."""
+
+import os
+import re
+import argparse
+import pandas as pd
+from jinja2 import Template
+from datetime import datetime
+
+# Long help text shown by argparse above the option list.
+description = '''
+------------------------
+Title: generate_master_html.py
+Date: 2024-12-16
+Author(s): Ryan Kennedy
+------------------------
+Description:
+    This script creates a master html file that points to all html files that were outputted from EMU.
+
+List of functions:
+    get_date_id, find_date_in_string, get_sample_ids, generate_master_html, main.
+
+List of standard modules:
+    os, re, argparse, datetime.
+
+List of "non standard" modules:
+    pandas, jinja2.
+
+Procedure:
+    1. Get sample IDs by parsing samplesheet csv.
+    2. Get the sequencing run date and the pipeline_info date id.
+    3. Render html using template.
+    4. Write out master.html file.
+
+-----------------------------------------------------------------------------------------------------------
+'''
+
+# Epilog shown by argparse below the option list; the flags must match the parser definition.
+usage = '''
+-----------------------------------------------------------------------------------------------------------
+Generates master html file that points to all html files.
+Executed using: python3 ./generate_master_html.py -c <Samplesheet_CSV_Filepath> -m <Master_HTML_Template_Filepath>
+-----------------------------------------------------------------------------------------------------------
+'''
+
+# Command-line interface: the samplesheet csv and the html template are both mandatory.
+parser = argparse.ArgumentParser(
+    description=description,
+    epilog=usage,
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+)
+parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.0.1')
+parser.add_argument(
+    '-c', '--csv',
+    dest='csv',
+    metavar='SAMPLESHEET_CSV_FILEPATH',
+    required=True,
+    help='input samplesheet csv filepath',
+)
+parser.add_argument(
+    '-m', '--html',
+    dest='html',
+    metavar='MASTER_HTML_TEMPLATE_FILEPATH',
+    required=True,
+    help='input master html template filepath',
+)
+
+# Parsed at module level; main() reads the resulting module-level `args`.
+args = parser.parse_args()
+
+def get_date_id(samplesheet_csv_fpath):
+    """Return the date id of the most recent execution report.
+
+    The ``pipeline_info`` directory is looked up next to the samplesheet and
+    scanned for files whose name starts with ``execution_report``. The date id
+    is the part of the filename from the ``YYYY-MM-DD`` date up to (not
+    including) the first dot, e.g. ``2025-01-23_10-43-07``.
+
+    NOTE(review): if the samplesheet is given as a bare filename its dirname is
+    '' and ``pipeline_info`` is resolved relative to the current working
+    directory - confirm this matches how the Nextflow process stages the csv.
+
+    Args:
+        samplesheet_csv_fpath: Path to the samplesheet csv.
+
+    Returns:
+        The date id string of the newest execution report.
+
+    Raises:
+        FileNotFoundError: If ``pipeline_info`` does not exist or holds no
+            dated execution report.
+        ValueError: If a matched date is not a valid calendar date.
+    """
+    parent_dir = os.path.dirname(samplesheet_csv_fpath)
+    pipeline_info_dir = os.path.join(parent_dir, 'pipeline_info')
+    date_id_pattern = re.compile(r'(\d{4}-\d{2}-\d{2}[^.]+)')
+    date_ids = []
+    for filename in os.listdir(pipeline_info_dir):
+        if not filename.startswith("execution_report"):
+            continue
+        # Search the bare filename, not the full path: a dated directory name
+        # (e.g. a run folder) would otherwise be matched before the filename.
+        match = date_id_pattern.search(filename)
+        if match:
+            date_ids.append(match.group(1))
+    if not date_ids:
+        raise FileNotFoundError(
+            "No dated execution_report file found in '{}'".format(pipeline_info_dir)
+        )
+    # Order by calendar date first (strptime also validates it) and break
+    # same-day ties on the full id so the latest run of the day wins; the
+    # tie-break assumes the part after the date is zero-padded.
+    return max(
+        date_ids,
+        key=lambda date_id: (datetime.strptime(date_id[:10], "%Y-%m-%d"), date_id),
+    )
+
+def find_date_in_string(input_string, date_pattern):
+    """Search for a date within a given string.
+
+    Args:
+        input_string: Text to search, e.g. a file path or filename.
+        date_pattern: Regular expression whose first capture group holds the
+            date.
+
+    Returns:
+        * an 8-character capture is parsed as ``YYYYMMDD`` and returned as
+          ``DD-MM-YYYY``;
+        * a longer capture is returned unchanged;
+        * ``"(No date found)"`` when the pattern does not match, the capture
+          is shorter than 8 characters, or the 8 characters are not a valid
+          date.
+    """
+    no_date = "(No date found)"
+    match = re.search(date_pattern, input_string)
+    if not match:
+        # Previously an empty string was returned here, which rendered as a
+        # blank date instead of the intended placeholder.
+        return no_date
+    date_regex = match.group(1)
+    if len(date_regex) == 8:
+        try:
+            return datetime.strptime(date_regex, "%Y%m%d").strftime("%d-%m-%Y")
+        except ValueError:
+            # Eight characters that are not a calendar date (e.g. a run number).
+            return no_date
+    if len(date_regex) > 8:
+        return date_regex
+    return no_date
+
+def get_sample_ids(samplesheet_csv):
+    """Return the values of the samplesheet's ``sample`` column as a list."""
+    samplesheet = pd.read_csv(samplesheet_csv)
+    return samplesheet['sample'].tolist()
+
+def generate_master_html(template_html_fpath, sample_ids, seqrun_date, date_id):
+    """Render the master html from a jinja2 template file.
+
+    Args:
+        template_html_fpath: Path to the jinja2 html template.
+        sample_ids: Sample IDs exposed to the template as ``sample_ids``.
+        seqrun_date: Sequencing run date exposed as ``seqrun_date``.
+        date_id: Date id exposed as ``date_id``.
+
+    Returns:
+        The rendered html as a string.
+    """
+    # Read as UTF-8 explicitly so the result does not depend on the locale of
+    # the machine or container the script runs in.
+    with open(template_html_fpath, "r", encoding="utf-8") as file:
+        master_template = file.read()
+    template = Template(master_template)
+    rendered_html = template.render(sample_ids=sample_ids, seqrun_date=seqrun_date, date_id=date_id)
+    return rendered_html
+
+def main():
+    """Write ``master.html`` to the current working directory.
+
+    Reads the samplesheet path (``args.csv``) and the template path
+    (``args.html``) from the module-level ``args``.
+    """
+    sample_ids = get_sample_ids(args.csv)
+    # The sequencing date is taken from an 8-digit ``/YYYYMMDD_`` component of
+    # the samplesheet path.
+    seqrun_date = find_date_in_string(args.csv, r'/(\d{8})_')
+    date_id = get_date_id(args.csv)
+    rendered_html = generate_master_html(args.html, sample_ids, seqrun_date, date_id)
+    # Write UTF-8 explicitly instead of relying on the locale encoding, so
+    # non-ASCII sample IDs are written the same way on every machine.
+    with open("master.html", "w", encoding="utf-8") as fout:
+        fout.write(rendered_html)
+
+if __name__ == "__main__":
+    main()
diff --git a/conf/cmd.config b/conf/cmd.config
@@ -0,0 +1,27 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for the CMD cluster profile
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines the site-specific settings (executor, queue, database path and resource
+    limits) used when running the pipeline with the `cmd` profile.
+
+    Use as follows:
+        nextflow run nf-core/gmsemu -profile cmd,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+// Executor settings belong in the `process` scope. Declared inside `params`
+// they only create the parameters `params.process.executor` / `params.process.queue`
+// and never reach the scheduler configuration.
+process {
+    executor = 'slurm'
+    queue    = 'low'
+}
+
+params {
+    config_profile_name         = 'cmd profile'
+    config_profile_description  = 'CMD High performance profile'
+
+    // Databases
+    db                          = '/fs1/pipelines/gms_16S-dev/assets/databases/emu_database'
+
+    // Resource ceilings for this profile
+    max_cpus                    = 60
+    max_memory                  = '300.GB'
+    max_time                    = '48.h'
+
+}
diff --git a/conf/modules.config b/conf/modules.config
@@ -44,6 +44,14 @@ process {
         ]
     }
 
+    withName: GENERATE_MASTER_HTML {
+        // Publish only master.html, directly into the top level of params.outdir.
+        publishDir = [
+            path: { "${params.outdir}/" },
+            mode: params.publish_dir_mode,
+            pattern: 'master.html'
+        ]
+    }
+
     withName: NANOPLOT1 {
         publishDir = [
             path: { "${params.outdir}/nanoplot" },
diff --git a/modules/local/generate_master_html/main.nf b/modules/local/generate_master_html/main.nf
@@ -0,0 +1,19 @@
+process GENERATE_MASTER_HTML {
+    // Renders master.html from the samplesheet csv using the template at params.master_template.
+    //               Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10").
+    //               For Conda, the build (i.e. "pyhdfd78af_1") must be EXCLUDED to support installation on different operating systems.
+    conda "bioconda::nf-core=3.0.2"
+    // Both container images carry the same version and build tag.
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/nf-core:3.0.2--pyhdfd78af_1':
+        'quay.io/biocontainers/nf-core:3.0.2--pyhdfd78af_1' }"
+
+    input:
+        path csv
+
+    output:
+        path 'master.html', emit: master_html
+
+    script:
+    // NOTE(review): the template is passed as a plain parameter string rather than a staged
+    // `path` input - confirm the path is reachable from inside the container.
+    """
+    generate_master_html.py --csv $csv --html $params.master_template
+    """
+}
diff --git a/modules/local/generate_master_html/meta.yml b/modules/local/generate_master_html/meta.yml
@@ -0,0 +1,54 @@
+name: "generate_master_html"
+description: Render a master HTML page that links to the per-sample and pipeline-level HTML reports.
+keywords:
+  - html
+  - report
+  - 16S
+
+tools:
+  - "generate_master_html.py":
+      description: "Local script (bin/generate_master_html.py) that renders the master HTML template with jinja2."
+      ## TODO: add homepage, documentation and licence details for this local script
+      tool_dev_url: "None"
+
+input:
+  - csv:
+      type: file
+      description: Samplesheet csv file; the `sample` column provides the sample IDs
+      pattern: "*.csv"
+
+output:
+  - master_html:
+      type: file
+      description: Rendered master HTML report
+      pattern: "master.html"
+
+authors:
+  - "@ryanjameskennedy"
diff --git a/nextflow.config b/nextflow.config
@@ -13,7 +13,7 @@ params {
     input                      = null
     db                         = null
 
-//    reads                    = null
+    // reads                   = null
     seqtype                    = "map-ont"
     min_abundance              = 0.0001
     minimap_max_alignments     = 50
@@ -22,20 +22,20 @@ params {
     keep_files                 = false
     output_unclassified        = true
 
+    // master html
+    master_template            = "$projectDir/assets/master_template.html"
 
-   //
-   // porechop_abi
-    adapter_trimming            = false
+    // porechop_abi
+    adapter_trimming           = false
 
-   //
-   // filtlong filtering
+    // filtlong filtering
     quality_filtering          = true
     longread_qc_qualityfilter_minlength = 1200
     longread_qc_qualityfilter_maxlength = 1800
     longread_qc_qualityfilter_min_mean_q = 94
 
     //Save the trimmed reads
-    save_preprocessed_reads = false
+    save_preprocessed_reads    = false
 
     // krona
     run_krona                  = true
@@ -173,7 +173,7 @@ profiles {
     test      { includeConfig 'conf/test.config'      }
     test_full { includeConfig 'conf/test_full.config' }
     full { includeConfig 'conf/full.config' }
-
+    cmd { includeConfig 'conf/cmd.config' }
 }
 
 
diff --git a/workflows/gmsemu.nf b/workflows/gmsemu.nf
@@ -66,6 +66,7 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check'
 include { MERGE_BARCODES              } from '../modules/local/merge_barcodes/main.nf'
 include { MERGE_BARCODES_SAMPLESHEET  } from '../modules/local/merge_barcodes_samplesheet/main.nf'
 include { GENERATE_INPUT              } from '../modules/local/generate_input/main.nf'
+include { GENERATE_MASTER_HTML        } from '../modules/local/generate_master_html/main.nf'
 //include { FALCO                     } from '../modules/nf-core/falco/main.nf'
 include { NANOPLOT as NANOPLOT1       } from '../modules/nf-core/nanoplot/main.nf'
 include { NANOPLOT  as NANOPLOT2      } from '../modules/nf-core/nanoplot/main.nf'
@@ -93,19 +94,19 @@ workflow GMSEMU {
 
 
     if ( params.merge_fastq_pass && !params.barcodes_samplesheet) {
-        MERGE_BARCODES (params.merge_fastq_pass)
+        MERGE_BARCODES(params.merge_fastq_pass)
         //GENERATE_INPUT(file("${params.outdir}/fastq_pass_merged"))
         GENERATE_INPUT(MERGE_BARCODES.out.fastq_dir_merged)
         //  ch_input = file(params.outdir + 'samplesheet_merged.csv')
         ch_input = GENERATE_INPUT.out.sample_sheet_merged
     } else if ( params.merge_fastq_pass && params.barcodes_samplesheet) {
-        MERGE_BARCODES_SAMPLESHEET (params.barcodes_samplesheet, params.merge_fastq_pass)
+        MERGE_BARCODES_SAMPLESHEET(params.barcodes_samplesheet, params.merge_fastq_pass)
 //        merged_files = (params.outdir + '/fastq_pass_merged')
-        GENERATE_INPUT (MERGE_BARCODES_SAMPLESHEET.out.fastq_dir_merged)
+        GENERATE_INPUT(MERGE_BARCODES_SAMPLESHEET.out.fastq_dir_merged)
         ch_input = GENERATE_INPUT.out.sample_sheet_merged
     }
 
-
+    GENERATE_MASTER_HTML(GENERATE_INPUT.out.sample_sheet_merged)
 
 
     //

Original file line number	Diff line number	Diff line change
`@@ -44,6 +44,14 @@ process {`
`44`	`44`	`]`
`45`	`45`	`}`
`46`	`46`
	`47`	`+ withName: GENERATE_MASTER_HTML {`
	`48`	`+ publishDir = [`
	`49`	`+ path: { "${params.outdir}/" },`
	`50`	`+ mode: params.publish_dir_mode,`
	`51`	`+ pattern: 'master.html'`
	`52`	`+ ]`
	`53`	`+ }`
	`54`	`+`
`47`	`55`	`withName: NANOPLOT1 {`
`48`	`56`	`publishDir = [`
`49`	`57`	`path: { "${params.outdir}/nanoplot" },`