From e8027638c051b2d3a47badb9e5dbd1a2d96eff4e Mon Sep 17 00:00:00 2001
From: jsabban <jules.sabban@inrae.fr>
Date: Tue, 12 Sep 2023 11:28:15 +0200
Subject: [PATCH] New step to merge lane fastq files

	Ref: #79
---
 conf/base.config                     |  6 ++++++
 docs/usage.md                        |  4 ++++
 modules/local/module_core.nf         | 26 ++++++++++++++++++++++++++
 nextflow.config                      |  1 +
 sub-workflows/local/core_pipeline.nf | 21 +++++++++++++++++++--
 5 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/conf/base.config b/conf/base.config
index 22bb404..e918941 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -55,6 +55,12 @@ process {
 		module = toolsModuleHash['R']
 	}
 
+	withName: MERGE_LANES {
+		cpus = 1
+		memory = { 1.GB * task.attempt }
+		time = { 2.h * task.attempt }
+	}
+
 	withName: ILLUMINA_FILTER {
 		publishDir = [
 			path: "${params.outdir}/IlluminaFilter",
diff --git a/docs/usage.md b/docs/usage.md
index 9908d51..10c2427 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -120,6 +120,10 @@ _Default_ : null
 The nG6 like description of the analysis.  
 _Default_ : null
 
+- **`--merge_lanes`** [bool]  
+Merge fastq over the two lanes in CORE pipeline.   
+_Default_ : false
+
 ### Skipping parameters
 There are some availlable flags can be set to not run some parts of the pipeline.  
 - **`--no_subset`** [bool]  
diff --git a/modules/local/module_core.nf b/modules/local/module_core.nf
index cf7e9f6..35bd868 100644
--- a/modules/local/module_core.nf
+++ b/modules/local/module_core.nf
@@ -117,6 +117,32 @@ process DUPLICATED_READS {
 	'''
 }
 
+process MERGE_LANES {	
+	tag "$sample"
+	
+	input:
+		tuple val(sample), path(reads)
+	
+	output:
+		tuple val(sample), path("*.fastq.gz"), emit: fastq
+	
+	script:
+	def args = task.ext.args ?: ''
+	"""
+	#!/bin/bash
+
+	R1_files=\$(ls *R1*)
+	R2_files=\$(ls *R2*)
+
+	for file in \$R1_files; do
+		zcat \$file >> ${sample}_R1_001.fastq
+	done
+
+	for file in \$R2_files; do
+		zcat \$file >> ${sample}_R2_001.fastq
+	done
+	"""
+}
 
 /* --------------------------------------------------------------------
  * 								OLD PROCESS
diff --git a/nextflow.config b/nextflow.config
index d621dab..242b327 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -12,6 +12,7 @@ params {
 	fc_id = ""
 	fc_type = ""
 	lane = ""
+	merge_lanes = false
 
 	data_nature = ""
 	species = ""
diff --git a/sub-workflows/local/core_pipeline.nf b/sub-workflows/local/core_pipeline.nf
index 272b1f2..d837b87 100644
--- a/sub-workflows/local/core_pipeline.nf
+++ b/sub-workflows/local/core_pipeline.nf
@@ -15,8 +15,9 @@ include {
 	FASTQC;
 	FASTQSCREEN;
 	DUPLICATED_READS;
+	MERGE_LANES;
 } from "$baseDir/modules/local/module_core.nf"
-include { GUNZIP			} from "${params.shared_modules}/gzip.nf"
+include { GUNZIP; GZIP		} from "${params.shared_modules}/gzip.nf"
 include { SEQTK_SAMPLE 		} from "${params.shared_modules}/seqtk.nf"
 include { md5sum as MD5SUM	} from "${params.shared_modules}/md5sum.nf"
 //-------------------------------------------------
@@ -28,9 +29,25 @@ isResume=workflow.resume
 //-------------------------------------------------
 workflow CORE {
 	take:
-		ch_read
+		ch_fastq
 		
 	main:
+		// ----------- Lane merging fastq
+		if (params.merge_lanes) {
+			MERGE_LANES(ch_fastq
+				.collect{it[1]}
+				.flatten()
+				.map { $it -> [ ($it.simpleName =~ /(.*)_S\d+_.*/)[0][1] , $it ] }
+				.groupTuple()
+			)
+
+			GZIP(MERGE_LANES.out.fastq)
+
+			ch_read = GZIP.out
+		} else {
+			ch_read = ch_fastq
+		}
+		
 		// ----------- md5sum
 		MD5SUM(ch_read.collect{it[1]}.flatten().collect(), params.run_name+'_fastq')
 		
-- 
GitLab