From c3e216fb2b368faebccfc255719bdc4271576809 Mon Sep 17 00:00:00 2001
From: Jonathan Kitt <jonathan.kitt@inrae.fr>
Date: Mon, 27 Jan 2025 13:33:01 +0100
Subject: [PATCH 1/4] move functions to misc/

---
 NAMESPACE                        |  14 --
 R/merge_segments.R               |  95 ------------
 man/calculate_maf.Rd             |  22 ---
 man/count_alleles.Rd             |  22 ---
 man/count_confidences.Rd         |  24 ---
 man/extract_ranges.Rd            |  22 ---
 man/find_cpts.Rd                 |  46 ------
 man/merge_segments.Rd            |  24 ---
 man/normalise_lr.Rd              |  24 ---
 man/regroup_data.Rd              |  24 ---
 man/summarise_segments.Rd        |  26 ----
 man/transform_lr.Rd              |  26 ----
 {R => misc}/calculate_maf.R      |   0
 {R => misc}/count_alleles.R      |   0
 {R => misc}/count_confidences.R  |   0
 {R => misc}/extract_ranges.R     |   0
 {R => misc}/find_cpts.R          |   0
 misc/merge_segments.R            | 257 +++++++++----------------------
 {R => misc}/normalise_lr.R       |   0
 {R => misc}/regroup_data.R       |   0
 {R => misc}/summarise_segments.R |   0
 {R => misc}/transform_lr.R       |   0
 22 files changed, 77 insertions(+), 549 deletions(-)
 delete mode 100644 R/merge_segments.R
 delete mode 100644 man/calculate_maf.Rd
 delete mode 100644 man/count_alleles.Rd
 delete mode 100644 man/count_confidences.Rd
 delete mode 100644 man/extract_ranges.Rd
 delete mode 100644 man/find_cpts.Rd
 delete mode 100644 man/merge_segments.Rd
 delete mode 100644 man/normalise_lr.Rd
 delete mode 100644 man/regroup_data.Rd
 delete mode 100644 man/summarise_segments.Rd
 delete mode 100644 man/transform_lr.Rd
 rename {R => misc}/calculate_maf.R (100%)
 rename {R => misc}/count_alleles.R (100%)
 rename {R => misc}/count_confidences.R (100%)
 rename {R => misc}/extract_ranges.R (100%)
 rename {R => misc}/find_cpts.R (100%)
 rename {R => misc}/normalise_lr.R (100%)
 rename {R => misc}/regroup_data.R (100%)
 rename {R => misc}/summarise_segments.R (100%)
 rename {R => misc}/transform_lr.R (100%)

diff --git a/NAMESPACE b/NAMESPACE
index 23a2879..519854c 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,22 +1,8 @@
 # Generated by roxygen2: do not edit by hand
 
 export(calculate_lr)
-export(calculate_maf)
-export(count_alleles)
-export(count_confidences)
-export(extract_ranges)
-export(find_cpts)
-export(merge_segments)
-export(normalise_lr)
 export(read_axiom)
-export(regroup_data)
-export(summarise_segments)
-export(transform_lr)
-import(GenomicRanges)
-import(changepoint)
 import(dplyr)
-import(plyranges)
-import(purrr)
 import(readr)
 import(stringr)
 import(tidyr)
diff --git a/R/merge_segments.R b/R/merge_segments.R
deleted file mode 100644
index 6ccd03f..0000000
--- a/R/merge_segments.R
+++ /dev/null
@@ -1,95 +0,0 @@
-#' Merge consecutive segments and recalculate parameters
-#'
-#' @param segments_file segments file
-#' @param regrouped_data_file lrn & calls file
-#'
-#' @import dplyr
-#' @import GenomicRanges
-#'
-#' @return A [tibble()].
-#' @export
-#'
-#' @examples
-#' \dontrun{
-#' merge_segments(segments_file, regrouped_data_file)
-#' }
-
-merge_segments <- function(segments_file, regrouped_data_file) {
-
-  # Set NULL variables
-
-  chromosome <- file_name <- group_name <- high_lrn_count <- mseg_end <-
-    mseg_start <- position <- seg_nb <- value <- NULL
-
-  # Merge consecutive segments
-  merged_segments <- segments_file %>%
-    dplyr::select(file_name, chromosome, start = seg_nb, end = seg_nb) %>%
-    GenomicRanges::makeGRangesListFromDataFrame(split.field = "file_name") %>%
-    GenomicRanges::reduce() %>%
-    tibble::as_tibble() %>%
-    dplyr::select(file_name = group_name,
-                  chromosome = seqnames,
-                  mseg_start = start,
-                  mseg_end = end)
-
-  # Keep non-merged segments
-  same_segments <- merged_segments %>%
-    dplyr::filter(mseg_start == mseg_end) %>%
-    dplyr::select(file_name, chromosome, seg_nb = mseg_start) %>%
-    dplyr::left_join(segments_file) %>%
-    dplyr::select(file_name, chromosome, start:high_lrn_count)
-
-  if (nrow(same_segments) != nrow(merged_segments)) {
-
-    # Extract merged segments and recalculate parameters
-    diff_segments <- merged_segments %>%
-      dplyr::filter(mseg_start != mseg_end) %>%
-      tidyr::pivot_longer(!c(file_name, chromosome)) %>%
-      dplyr::select(file_name, chromosome, seg_nb = value) %>%
-      dplyr::left_join(segments_file) %>%
-      dplyr::select(file_name, chromosome, start:end) %>%
-      dplyr::group_by(file_name, chromosome) %>%
-      dplyr::mutate(start = min(start),
-                    end = max(end)) %>%
-      dplyr::slice(1)
-
-    lrn_means <- list()
-    snp_counts <- list()
-    otv_counts <- list()
-    low_lrn_counts <- list()
-    high_lrn_counts <- list()
-
-    for (i in 1:nrow(diff_segments)) {
-      d1 <- regrouped_data_file %>%
-        dplyr::filter(file_name == diff_segments$file_name[i] &
-                        chromosome == diff_segments$chromosome[i] &
-                        position >= diff_segments$start[i] &
-                        position <= diff_segments$end[i])
-
-      lrn_means[[i]] <- mean(d1$lrn)
-      snp_counts[[i]] <- nrow(d1)
-      otv_counts[[i]] <- length(which(d1$call == "-2"))
-      low_lrn_counts[[i]] <- length(which(d1$lrn <= -1))
-      high_lrn_counts[[i]] <- length(which(d1$lrn >= 1))
-    }
-
-    diff_segments$lrn_mean <- unlist(lrn_means)
-    diff_segments$snp_count <- unlist(snp_counts)
-    diff_segments$otv_count <- unlist(otv_counts)
-    diff_segments$low_lrn_count <- unlist(low_lrn_counts)
-    diff_segments$high_lrn_count <- unlist(high_lrn_counts)
-
-    # rbind same_segments & diff_segments + reorder
-    final_mseg <- rbind(same_segments, diff_segments)
-
-  }
-
-  if (nrow(same_segments) == nrow(merged_segments)) {
-
-    final_mseg <- same_segments
-
-  }
-
-  return(final_mseg)
-
-}
diff --git a/man/calculate_maf.Rd b/man/calculate_maf.Rd
deleted file mode 100644
index 0b7ba2c..0000000
--- a/man/calculate_maf.Rd
+++ /dev/null
@@ -1,22 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/calculate_maf.R
-\name{calculate_maf}
-\alias{calculate_maf}
-\title{Calculate MAF from allele counts}
-\usage{
-calculate_maf(allele_count)
-}
-\arguments{
-\item{allele_count}{allele count file}
-}
-\value{
-A \code{\link[=tibble]{tibble()}}.
-}
-\description{
-Calculate MAF from allele counts
-}
-\examples{
-\dontrun{
-calculate_maf(allele_count)
-}
-}
diff --git a/man/count_alleles.Rd b/man/count_alleles.Rd
deleted file mode 100644
index 647e301..0000000
--- a/man/count_alleles.Rd
+++ /dev/null
@@ -1,22 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/count_alleles.R
-\name{count_alleles}
-\alias{count_alleles}
-\title{Count alleles for each SNP in AximGT1.calls.txt data}
-\usage{
-count_alleles(calls_file)
-}
-\arguments{
-\item{calls_file}{calls file}
-}
-\value{
-A \code{\link[=tibble]{tibble()}}.
-}
-\description{
-Count alleles for each SNP in AximGT1.calls.txt data
-}
-\examples{
-\dontrun{
-count_alleles(calls_file)
-}
-}
diff --git a/man/count_confidences.Rd b/man/count_confidences.Rd
deleted file mode 100644
index 0a1210a..0000000
--- a/man/count_confidences.Rd
+++ /dev/null
@@ -1,24 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/count_confidences.R
-\name{count_confidences}
-\alias{count_confidences}
-\title{Count number of values above confidence score threshold for each SNP in AxiomGT1.confidences.txt data}
-\usage{
-count_confidences(confidences_file, threshold = 0.15)
-}
-\arguments{
-\item{confidences_file}{confidences file}
-
-\item{threshold}{confidence score threshold (defaults to 0.15)}
-}
-\value{
-A \code{\link[=tibble]{tibble()}}.
-}
-\description{
-Count number of values above confidence score threshold for each SNP in AxiomGT1.confidences.txt data
-}
-\examples{
-\dontrun{
-count_confidences(confidences_file, threshold)
-}
-}
diff --git a/man/extract_ranges.Rd b/man/extract_ranges.Rd
deleted file mode 100644
index 555151e..0000000
--- a/man/extract_ranges.Rd
+++ /dev/null
@@ -1,22 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/extract_ranges.R
-\name{extract_ranges}
-\alias{extract_ranges}
-\title{Extract ranges}
-\usage{
-extract_ranges(snps)
-}
-\arguments{
-\item{snps}{list of SNPs}
-}
-\value{
-a table with minimum and maximum SNP positions for each chromosome
-}
-\description{
-Extract ranges
-}
-\examples{
-\dontrun{
-extract_ranges(snps)
-}
-}
diff --git a/man/find_cpts.Rd b/man/find_cpts.Rd
deleted file mode 100644
index a518517..0000000
--- a/man/find_cpts.Rd
+++ /dev/null
@@ -1,46 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/find_cpts.R
-\name{find_cpts}
-\alias{find_cpts}
-\title{Find changepoints in lrn data}
-\usage{
-find_cpts(
-  lr_st,
-  calc = "mean",
-  var = "lr_st_ref",
-  method = "BinSeg",
-  penalty = "MBIC",
-  pen_value = 0,
-  max_cpts = 3,
-  count_otvs = TRUE
-)
-}
-\arguments{
-\item{lr_st}{standardized lr values}
-
-\item{calc}{changepoints will be calculated w.r.t. this value ("mean", "var" or "mean_var")}
-
-\item{var}{variable  to be  used for changepoint detection}
-
-\item{method}{changepoint detection method (defaults to "BinSeg")}
-
-\item{penalty}{changepoint detection penalty (defaults to "MBIC")}
-
-\item{pen_value}{penalty value (defaults  to 0)}
-
-\item{max_cpts}{maximum number of  changepoints (defaults to 3)}
-
-\item{count_otvs}{whether or not to add number of OTVs in detected segments (defaults to TRUE)}
-}
-\value{
-a \code{\link[=tibble]{tibble()}}
-}
-\description{
-Find changepoints in lrn data
-}
-\examples{
-\dontrun{
-find_cpts(lr_st, calc = "mean",  var = "lr_st_ref", method = "BinSeg",
-penalty = "MBIC", pen_value = 0, max_cpts = 3, count_otvs = TRUE)
-}
-}
diff --git a/man/merge_segments.Rd b/man/merge_segments.Rd
deleted file mode 100644
index f4276c8..0000000
--- a/man/merge_segments.Rd
+++ /dev/null
@@ -1,24 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/merge_segments.R
-\name{merge_segments}
-\alias{merge_segments}
-\title{Merge consecutive segments and recalculate parameters}
-\usage{
-merge_segments(segments_file, regrouped_data_file)
-}
-\arguments{
-\item{segments_file}{segments file}
-
-\item{regrouped_data_file}{lrn & calls file}
-}
-\value{
-A \code{\link[=tibble]{tibble()}}.
-}
-\description{
-Merge consecutive segments and recalculate parameters
-}
-\examples{
-\dontrun{
-merge_segments(segments_file, regrouped_data_file)
-}
-}
diff --git a/man/normalise_lr.Rd b/man/normalise_lr.Rd
deleted file mode 100644
index 8c07644..0000000
--- a/man/normalise_lr.Rd
+++ /dev/null
@@ -1,24 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/normalise_lr.R
-\name{normalise_lr}
-\alias{normalise_lr}
-\title{Normalise lr values}
-\usage{
-normalise_lr(lr, norm)
-}
-\arguments{
-\item{lr}{lr values}
-
-\item{norm}{sample used for normalisation}
-}
-\value{
-a \code{\link[=tibble]{tibble()}}
-}
-\description{
-Normalise lr values
-}
-\examples{
-\dontrun{
-normalise_lr(lr, norm)
-}
-}
diff --git a/man/regroup_data.Rd b/man/regroup_data.Rd
deleted file mode 100644
index 48b1de2..0000000
--- a/man/regroup_data.Rd
+++ /dev/null
@@ -1,24 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/regroup_data.R
-\name{regroup_data}
-\alias{regroup_data}
-\title{Regroup lrn and calls data}
-\usage{
-regroup_data(lrn_file, calls_file)
-}
-\arguments{
-\item{lrn_file}{lrn file}
-
-\item{calls_file}{calls file}
-}
-\value{
-A \code{\link[=tibble]{tibble()}}.
-}
-\description{
-Regroup lrn and calls data
-}
-\examples{
-\dontrun{
-regroup_data(lrn_file, calls_file)
-}
-}
diff --git a/man/summarise_segments.Rd b/man/summarise_segments.Rd
deleted file mode 100644
index 7644dd7..0000000
--- a/man/summarise_segments.Rd
+++ /dev/null
@@ -1,26 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/summarise_segments.R
-\name{summarise_segments}
-\alias{summarise_segments}
-\title{Summarise segments : count OTVs}
-\usage{
-summarise_segments(raw_segments_file, regrouped_data_file, snps_file)
-}
-\arguments{
-\item{raw_segments_file}{raw segments file}
-
-\item{regrouped_data_file}{file containing lrn values and calls}
-
-\item{snps_file}{list of SNPs}
-}
-\value{
-A \code{\link[=tibble]{tibble()}}.
-}
-\description{
-Summarise segments : count OTVs
-}
-\examples{
-\dontrun{
-summarise_segments(raw_segments_file, regrouped_data_file, snps_file)
-}
-}
diff --git a/man/transform_lr.Rd b/man/transform_lr.Rd
deleted file mode 100644
index 384864a..0000000
--- a/man/transform_lr.Rd
+++ /dev/null
@@ -1,26 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/transform_lr.R
-\name{transform_lr}
-\alias{transform_lr}
-\title{Transform lr values}
-\usage{
-transform_lr(lr, method = "ref", ref)
-}
-\arguments{
-\item{lr}{lr values}
-
-\item{method}{how to perform standardization ("ref", "mean", or "both")}
-
-\item{ref}{sample used for standardization}
-}
-\value{
-a \code{\link[=tibble]{tibble()}}
-}
-\description{
-Transform lr values
-}
-\examples{
-\dontrun{
-transform_lr(lr, method = "ref", ref)
-}
-}
diff --git a/R/calculate_maf.R b/misc/calculate_maf.R
similarity index 100%
rename from R/calculate_maf.R
rename to misc/calculate_maf.R
diff --git a/R/count_alleles.R b/misc/count_alleles.R
similarity index 100%
rename from R/count_alleles.R
rename to misc/count_alleles.R
diff --git a/R/count_confidences.R b/misc/count_confidences.R
similarity index 100%
rename from R/count_confidences.R
rename to misc/count_confidences.R
diff --git a/R/extract_ranges.R b/misc/extract_ranges.R
similarity index 100%
rename from R/extract_ranges.R
rename to misc/extract_ranges.R
diff --git a/R/find_cpts.R b/misc/find_cpts.R
similarity index 100%
rename from R/find_cpts.R
rename to misc/find_cpts.R
diff --git a/misc/merge_segments.R b/misc/merge_segments.R
index 249ce8a..6ccd03f 100644
--- a/misc/merge_segments.R
+++ b/misc/merge_segments.R
@@ -1,198 +1,95 @@
-#' Filter and merge segments obtained from changepoint detection
+#' Merge consecutive segments and recalculate parameters
 #'
-#' @description
-#' This function filters and merges segments detected using the \code{detect_cpts()} function.
+#' @param segments_file segments file
+#' @param regrouped_data_file lrn & calls file
 #'
-#' @import data.table
-#' @importFrom plyr join
-#' @importFrom GenomicRanges makeGRangesListFromDataFrame reduce
-#'
-#' @param array name of the array to be analyzed.
-#' @param raw_seg_file path to the \code{raw_segments} file.
-#' @param mean_cutoff threshold lrn_mean value (absolute value) used to determine whether a segment should
-#' be kept (defaults to 0.25).
-#' @param size_cutoff threshold segment size value used to determine whether a segment should be kept (defaults to 500kb).
-#' @param output_path where the results will be saved
-#'
-#' @details
-#' The merging is done in two consecutive steps :
-#' - merge consecutive segments with a lrn_mean value regardless of gaps
-#' - merge segments separated by another segment with a size inferior to size_cutoff
-#'
-#' @usage merge_segments(array, raw_seg_file, mean_cutoff = 0.25,
-#' size_cutoff = 500e3, output_path)
-#'
-#' @return A \code{merged_segments} data.table containing the following columns :
-#' Returns a table containing the following columns :
-#' - species
-#' - array_type
-#' - array_name
-#' - unique_id
-#' - sample_name
-#' - chromosome
-#' - start
-#' - end
-#' - sv_type
+#' @import dplyr
+#' @import GenomicRanges
 #'
+#' @return A [tibble()].
 #' @export
 #'
 #' @examples
 #' \dontrun{
-#' merge_segments(array, raw_seg_file, mean_cutoff, size_cutoff, output_path)
+#' merge_segments(segments_file, regrouped_data_file)
 #' }
 
-merge_segments <- function(array, raw_seg_file, mean_cutoff = 0.25,
-                           size_cutoff = 500e3, output_path) {
-
-  lrn_mean <- end <- start <- nb_cpts <- seg_nb <-
-    . <- array_name <- analysis_group <- unique_id <-
-      chromosome <- head <- size <- width <- lrn_cutoff <-
-    raw_seg_file <- NULL
-
-  # Check arguments ----
-
-  if (missing(array)) stop("array is required")
-  if (missing(raw_seg_file)) stop("raw_seg_file is required")
-  if (missing(output_path)) stop("output_path is required")
-
-  # Create folders for saving results ----
-
-  if (!dir.exists(paste0(output_path, "results/"))) {
-    dir.create(paste0(output_path, "results/"))
-  }
-
-  if (!dir.exists(paste0(output_path, "results/merged_segments"))) {
-    dir.create(paste0(output_path, "results/merged_segments/"))
-  }
-
-  # 1. Import data and prepare files ----
-
-  # 1.1. import segmentation file
-  raw_seg <- data.table::fread(raw_seg_file)
-  rm(raw_seg_file)
-
-  # 1.3. extract list of samples
-  smp <- raw_seg[, 1:5]
-  smp <- smp[!duplicated(smp), ]
-
-  # 1.4. split segments into seg_neg & seg_pos + filter
-  seg_neg <- raw_seg[lrn_mean <= -mean_cutoff][, c(4, 7, 9:11)]
-  seg_pos <- raw_seg[lrn_mean >= mean_cutoff][, c(4, 7, 9:11)]
-  rm(raw_seg)
-
-  # 2. Merge consecutive segments ----
-
-  # 2.1. create GRangesLists
-
-  seg_nb_neg <- seg_neg[, c(1:3, 3)]
-  seg_nb_pos <- seg_pos[, c(1:3, 3)]
-
-  names(seg_nb_neg) <- c("unique_id", "chromosome", "start", "end")
-  names(seg_nb_pos) <- c("unique_id", "chromosome", "start", "end")
+merge_segments <- function(segments_file, regrouped_data_file) {
+
+  # Set NULL variables
+
+  chromosome <- file_name <- group_name <- high_lrn_count <- mseg_end <-
+    mseg_start <- position <- seg_nb <- value <- NULL
+
+  # Merge consecutive segments
+  merged_segments <- segments_file %>%
+    dplyr::select(file_name, chromosome, start = seg_nb, end = seg_nb) %>%
+    GenomicRanges::makeGRangesListFromDataFrame(split.field = "file_name") %>%
+    GenomicRanges::reduce() %>%
+    tibble::as_tibble() %>%
+    dplyr::select(file_name = group_name,
+                  chromosome = seqnames,
+                  mseg_start = start,
+                  mseg_end = end)
+
+  # Keep non-merged segments
+  same_segments <- merged_segments %>%
+    dplyr::filter(mseg_start == mseg_end) %>%
+    dplyr::select(file_name, chromosome, seg_nb = mseg_start) %>%
+    dplyr::left_join(segments_file) %>%
+    dplyr::select(file_name, chromosome, start:high_lrn_count)
+
+  if (nrow(same_segments) != nrow(merged_segments)) {
+
+    # Extract merged segments and recalculate parameters
+    diff_segments <- merged_segments %>%
+      dplyr::filter(mseg_start != mseg_end) %>%
+      tidyr::pivot_longer(!c(file_name, chromosome)) %>%
+      dplyr::select(file_name, chromosome, seg_nb = value) %>%
+      dplyr::left_join(segments_file) %>%
+      dplyr::select(file_name, chromosome, start:end) %>%
+      dplyr::group_by(file_name, chromosome) %>%
+      dplyr::mutate(start = min(start),
+                    end = max(end)) %>%
+      dplyr::slice(1)
+
+    lrn_means <- list()
+    snp_counts <- list()
+    otv_counts <- list()
+    low_lrn_counts <- list()
+    high_lrn_counts <- list()
+
+    for (i in 1:nrow(diff_segments)) {
+      d1 <- regrouped_data_file %>%
+        dplyr::filter(file_name == diff_segments$file_name[i] &
+                        chromosome == diff_segments$chromosome[i] &
+                        position >= diff_segments$start[i] &
+                        position <= diff_segments$end[i])
+
+      lrn_means[[i]] <- mean(d1$lrn)
+      snp_counts[[i]] <- nrow(d1)
+      otv_counts[[i]] <- length(which(d1$call == "-2"))
+      low_lrn_counts[[i]] <- length(which(d1$lrn <= -1))
+      high_lrn_counts[[i]] <- length(which(d1$lrn >= 1))
+    }
+
+    diff_segments$lrn_mean <- unlist(lrn_means)
+    diff_segments$snp_count <- unlist(snp_counts)
+    diff_segments$otv_count <- unlist(otv_counts)
+    diff_segments$low_lrn_count <- unlist(low_lrn_counts)
+    diff_segments$high_lrn_count <- unlist(high_lrn_counts)
+
+    # rbind same_segments & diff_segments + reorder
+    final_mseg <- rbind(same_segments, diff_segments)
 
-  seg_nb_neg <- GenomicRanges::makeGRangesListFromDataFrame(df = seg_nb_neg, split.field = "unique_id")
-  seg_nb_pos <- GenomicRanges::makeGRangesListFromDataFrame(df = seg_nb_pos,split.field = "unique_id")
-
-  # 2.2. reduce GRangesLists
-
-  mseg_nb_neg <- GenomicRanges::reduce(seg_nb_neg)
-  mseg_nb_pos <- GenomicRanges::reduce(seg_nb_pos)
-
-  # 2.3. create tables
-
-  mseg_nb_neg_list <- list()
-  for (i in 1:length(mseg_nb_neg)) {
-    mseg_nb_neg_list[[i]] <- data.table::as.data.table(mseg_nb_neg[[i]])
-    mseg_nb_neg_list[[i]]$unique_id <- names(mseg_nb_neg[i])
-  }
-  mseg_nb_neg_tb <- data.table::rbindlist(mseg_nb_neg_list)[, c(1:3, 6)]
-
-  mseg_nb_pos_list <- list()
-  for (i in 1:length(mseg_nb_pos)) {
-    mseg_nb_pos_list[[i]] <- data.table::as.data.table(mseg_nb_pos[[i]])
-    mseg_nb_pos_list[[i]]$unique_id <- names(mseg_nb_pos[i])
-  }
-  mseg_nb_pos_tb <- data.table::rbindlist(mseg_nb_pos_list)[, c(1:3, 6)]
-
-  # 2.4. extract physical positions
-
-  mseg_nb_neg <- list()
-  for (i in 1:nrow(mseg_nb_neg_tb)) {
-    d1 <- seg_neg[unique_id == mseg_nb_neg_tb$unique_id[i] & chromosome == mseg_nb_neg_tb$seqnames[i]]
-    mseg_nb_neg[[i]] <- data.table::data.table(
-      chromosome = unique(d1$chromosome),
-      unique_id = unique(d1$unique_id),
-      start = d1$start[d1$seg_nb == mseg_nb_neg_tb$start[i]],
-      end = d1$end[d1$seg_nb == mseg_nb_neg_tb$end[i]]
-    )
-  }
-  mseg_nb_neg_dt <- data.table::rbindlist(mseg_nb_neg)
-
-  mseg_nb_pos <- list()
-  for (i in 1:nrow(mseg_nb_pos_tb)) {
-    d1 <- seg_pos[unique_id == mseg_nb_pos_tb$unique_id[i] & chromosome == mseg_nb_pos_tb$seqnames[i]]
-    mseg_nb_pos[[i]] <- data.table::data.table(
-      chromosome = unique(d1$chromosome),
-      unique_id = unique(d1$unique_id),
-      start = d1$start[d1$seg_nb == mseg_nb_pos_tb$start[i]],
-      end = d1$end[d1$seg_nb == mseg_nb_pos_tb$end[i]]
-    )
   }
-  mseg_nb_pos_dt <- data.table::rbindlist(mseg_nb_pos)
-
-  rm(d1, mseg_nb_neg, mseg_nb_neg_tb, mseg_nb_pos, mseg_nb_pos_tb,
-     seg_nb_neg, seg_nb_pos, seg_neg, seg_pos, i, mean_cutoff, mseg_nb_neg_list, mseg_nb_pos_list)
-
-  # 3. Merge segments according to gaps ----
 
-  # 3.1. create GRangesLists
+  if (nrow(same_segments) == nrow(merged_segments)) {
 
-  seg_neg <- GenomicRanges::makeGRangesListFromDataFrame(df = mseg_nb_neg_dt, split.field = "unique_id")
-  seg_pos <- GenomicRanges::makeGRangesListFromDataFrame(df = mseg_nb_pos_dt, split.field = "unique_id")
+    final_mseg <- same_segments
 
-  # 3.2. reduce GRangesLists
-
-  mseg_neg <- GenomicRanges::reduce(seg_neg, min.gapwidth = size_cutoff)
-  mseg_pos <- GenomicRanges::reduce(seg_pos, min.gapwidth = size_cutoff)
-
-  merged_segments_neg <- list()
-  for (i in 1:length(mseg_neg)) {
-    merged_segments_neg[[i]] <- data.table::as.data.table(mseg_neg[[i]])
-    merged_segments_neg[[i]]$unique_id <- names(mseg_neg[i])
-    merged_segments_neg[[i]]$sv_type <- "negative"
-  }
-  merged_segments_neg_table <- data.table::rbindlist(merged_segments_neg)
-  merged_segments_neg_table <- merged_segments_neg_table[width >= size_cutoff][, c(1:3, 6:7)]
-
-  merged_segments_pos <- list()
-  for (i in 1:length(mseg_pos)) {
-    merged_segments_pos[[i]] <- data.table::as.data.table(mseg_pos[[i]])
-    merged_segments_pos[[i]]$unique_id <- names(mseg_pos[i])
-    merged_segments_pos[[i]]$sv_type <- "positive"
   }
-  merged_segments_pos_table <- data.table::rbindlist(merged_segments_pos)
-  merged_segments_pos_table <- merged_segments_pos_table[width >= size_cutoff][, c(1:3, 6:7)]
-
-  # 3.3. join negative and positive tables
-
-  merged_segments <- rbind(merged_segments_neg_table, merged_segments_pos_table)
-  names(merged_segments) <- c("chromosome", names(merged_segments)[-1])
-
-  rm(merged_segments_neg, merged_segments_neg_table, merged_segments_pos, merged_segments_pos_table,
-     mseg_nb_neg_dt, mseg_nb_pos_dt, mseg_neg, mseg_pos, seg_neg, seg_pos, i, size_cutoff)
-
-  # 3.4. join merged segments with samples
-  mseg_final <- plyr::join(merged_segments, smp)
-  mseg_final <- mseg_final[, c(6:8, 4, 9, 1:3, 5)]
-
-  rm(merged_segments, smp)
-
-  # 4. Save output ----
-
-  data.table::fwrite(mseg_final,
-                     paste0(output_path, "results/merged_segments/", array, "_merged_segments.txt"),
-                     quote = FALSE, row.names = FALSE, sep = "\t")
 
-  rm(mseg_final, array, raw_seg_file, output_path)
+  return(final_mseg)
 
 }
diff --git a/R/normalise_lr.R b/misc/normalise_lr.R
similarity index 100%
rename from R/normalise_lr.R
rename to misc/normalise_lr.R
diff --git a/R/regroup_data.R b/misc/regroup_data.R
similarity index 100%
rename from R/regroup_data.R
rename to misc/regroup_data.R
diff --git a/R/summarise_segments.R b/misc/summarise_segments.R
similarity index 100%
rename from R/summarise_segments.R
rename to misc/summarise_segments.R
diff --git a/R/transform_lr.R b/misc/transform_lr.R
similarity index 100%
rename from R/transform_lr.R
rename to misc/transform_lr.R
-- 
GitLab


From 618afc246512717ce81adecdad98463490703789 Mon Sep 17 00:00:00 2001
From: Jonathan Kitt <jonathan.kitt@inrae.fr>
Date: Mon, 27 Jan 2025 15:42:28 +0100
Subject: [PATCH 2/4] replace normalise_lr() and transform_lr() by
 standardise_lr()

---
 NAMESPACE             |  1 +
 R/calculate_lr.R      | 14 ++--------
 R/standardise_lr.R    | 62 +++++++++++++++++++++++++++++++++++++++++++
 man/calculate_lr.Rd   |  6 ++---
 man/standardise_lr.Rd | 26 ++++++++++++++++++
 5 files changed, 93 insertions(+), 16 deletions(-)
 create mode 100644 R/standardise_lr.R
 create mode 100644 man/standardise_lr.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 519854c..18e70c1 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export(calculate_lr)
 export(read_axiom)
+export(standardise_lr)
 import(dplyr)
 import(readr)
 import(stringr)
diff --git a/R/calculate_lr.R b/R/calculate_lr.R
index 076e8ef..43d1da1 100644
--- a/R/calculate_lr.R
+++ b/R/calculate_lr.R
@@ -1,7 +1,6 @@
 #' Calculate lr values from summary file
 #'
 #' @param summary_file imported summary file (usually named 'summary')
-#' @param format "wide" or "long" table (defaults to "long")
 #'
 #' @import dplyr
 #' @import tidyr
@@ -11,10 +10,10 @@
 #'
 #' @examples
 #' \dontrun{
-#' calculate_lr(summary_file, format = "long")
+#' calculate_lr(summary_file)
 #' }
 
-calculate_lr <- function(summary_file, format = "long") {
+calculate_lr <- function(summary_file) {
 
   # Set NULL variables
 
@@ -47,15 +46,6 @@ calculate_lr <- function(summary_file, format = "long") {
     dplyr::mutate(lr = log2(sqrt(signal_a^2 + signal_b^2)),
                   .keep = "unused")
 
-  # If format is set to "wide", transform the table
-  if (format == "wide") {
-    message("Pivoting output table to wide format")
-    lr <- lr |>
-      tidyr::pivot_wider(id_cols = probeset_id,
-                         names_from = file_name,
-                         values_from = lr)
-  }
-
   # Assign to Global Environment
   assign(x = "lr", value = lr, pos = ".GlobalEnv")
 
diff --git a/R/standardise_lr.R b/R/standardise_lr.R
new file mode 100644
index 0000000..31c27a5
--- /dev/null
+++ b/R/standardise_lr.R
@@ -0,0 +1,62 @@
+#' Standardise lr values
+#'
+#' @param lr lr values
+#' @param method how to perform standardisation ("ref", "mean", or "both", defaults to "mean")
+#' @param ref name of the sample to use for standardisation (must be defined if method is "ref" or "both", defaults to NULL)
+#'
+#' @return a [tibble()]
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' standardise_lr(lr, method = "mean", ref = NULL)
+#' }
+
+standardise_lr <- function(lr, method = "mean", ref = NULL) {
+
+  # Set NULL variables
+
+  file_name <- probeset_id <- NULL
+
+  # If method = "ref" or "both", check if ref is defined
+  if(method  %in% c("ref", "both")){
+    print("Checking if reference sample is defined")
+    if(is.null(ref)){
+      stop("Reference sample is not defined")
+    }
+    if(!is.null(ref) & !ref %in% names(calls)) {
+      stop("Reference sample doesn't exist in the list of samples")
+    }
+  }
+
+  # If method = "mean", calculate lr_st_mean
+  if (method == "mean") {
+    lr <- lr |>
+      dplyr::mutate(lr_mean = mean(lr, na.rm = TRUE),
+                    .by = probeset_id) |>
+      dplyr::mutate(lr_st_mean = lr - lr_mean) |>
+      dplyr::select(-lr_mean)
+  }
+
+  # If method = "ref", calculate lr_st_ref
+  if (method == "ref") {
+    lr <- lr |>
+      dplyr::mutate(lr_st_ref = lr - lr[file_name == ref],
+                    .by = probeset_id)
+  }
+
+  # If method = "both", calculate lr_st_ref & lr_st_mean
+  if (method == "both") {
+    lr <- lr |>
+      dplyr::mutate(lr_st_ref = lr - lr[file_name == ref],
+                    lr_mean = mean(lr, na.rm = TRUE),
+                    .by = probeset_id) |>
+      dplyr::mutate(lr_st_mean = lr - lr_mean,
+                    .before = lr_st_ref) |>
+      dplyr::select(-lr_mean)
+  }
+
+  # Assign to Global Environment
+  assign(x = "lr", value = lr, pos = ".GlobalEnv")
+
+}
diff --git a/man/calculate_lr.Rd b/man/calculate_lr.Rd
index 38748af..d9a2191 100644
--- a/man/calculate_lr.Rd
+++ b/man/calculate_lr.Rd
@@ -4,12 +4,10 @@
 \alias{calculate_lr}
 \title{Calculate lr values from summary file}
 \usage{
-calculate_lr(summary_file, format = "long")
+calculate_lr(summary_file)
 }
 \arguments{
 \item{summary_file}{imported summary file (usually named 'summary')}
-
-\item{format}{"wide" or "long" table (defaults to "long")}
 }
 \value{
 A \code{\link[=tibble]{tibble()}}.
@@ -19,6 +17,6 @@ Calculate lr values from summary file
 }
 \examples{
 \dontrun{
-calculate_lr(summary_file, format = "long")
+calculate_lr(summary_file)
 }
 }
diff --git a/man/standardise_lr.Rd b/man/standardise_lr.Rd
new file mode 100644
index 0000000..48f2038
--- /dev/null
+++ b/man/standardise_lr.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/standardise_lr.R
+\name{standardise_lr}
+\alias{standardise_lr}
+\title{Standardise lr values}
+\usage{
+standardise_lr(lr, method = "mean", ref = NULL)
+}
+\arguments{
+\item{lr}{lr values}
+
+\item{method}{how to perform standardisation ("ref", "mean", or "both", defaults to "mean")}
+
+\item{ref}{name of the sample to use for standardisation (must be defined if method is "ref" or "both", defaults to NULL)}
+}
+\value{
+a \code{\link[=tibble]{tibble()}}
+}
+\description{
+Standardise lr values
+}
+\examples{
+\dontrun{
+standardise_lr(lr, method = "mean", ref = NULL)
+}
+}
-- 
GitLab


From 469889dd0a8fcbde11ae38dc27e2bee426c16cb0 Mon Sep 17 00:00:00 2001
From: Jonathan Kitt <jonathan.kitt@inrae.fr>
Date: Mon, 27 Jan 2025 15:45:30 +0100
Subject: [PATCH 3/4] add standardize_lr() function

---
 NAMESPACE             |  1 +
 R/standardize_lr.R    | 62 +++++++++++++++++++++++++++++++++++++++++++
 man/standardize_lr.Rd | 26 ++++++++++++++++++
 3 files changed, 89 insertions(+)
 create mode 100644 R/standardize_lr.R
 create mode 100644 man/standardize_lr.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 18e70c1..1ae3984 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -3,6 +3,7 @@
 export(calculate_lr)
 export(read_axiom)
 export(standardise_lr)
+export(standardize_lr)
 import(dplyr)
 import(readr)
 import(stringr)
diff --git a/R/standardize_lr.R b/R/standardize_lr.R
new file mode 100644
index 0000000..a4477d7
--- /dev/null
+++ b/R/standardize_lr.R
@@ -0,0 +1,62 @@
+#' Standardize lr values
+#'
+#' @param lr lr values
+#' @param method how to perform standardization ("ref", "mean", or "both", defaults to "mean")
+#' @param ref name of the sample to use for standardization (must be defined if method is "ref" or "both", defaults to NULL)
+#'
+#' @return a [tibble()]
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' standardize_lr(lr, method = "mean", ref = NULL)
+#' }
+
+standardize_lr <- function(lr, method = "mean", ref = NULL) {
+
+  # Set NULL variables
+
+  file_name <- probeset_id <- NULL
+
+  # If method = "ref" or "both", check if ref is defined
+  if(method  %in% c("ref", "both")){
+    print("Checking if reference sample is defined")
+    if(is.null(ref)){
+      stop("Reference sample is not defined")
+    }
+    if(!is.null(ref) & !ref %in% names(calls)) {
+      stop("Reference sample doesn't exist in the list of samples")
+    }
+  }
+
+  # If method = "mean", calculate lr_st_mean
+  if (method == "mean") {
+    lr <- lr |>
+      dplyr::mutate(lr_mean = mean(lr, na.rm = TRUE),
+                    .by = probeset_id) |>
+      dplyr::mutate(lr_st_mean = lr - lr_mean) |>
+      dplyr::select(-lr_mean)
+  }
+
+  # If method = "ref", calculate lr_st_ref
+  if (method == "ref") {
+    lr <- lr |>
+      dplyr::mutate(lr_st_ref = lr - lr[file_name == ref],
+                    .by = probeset_id)
+  }
+
+  # If method = "both", calculate lr_st_ref & lr_st_mean
+  if (method == "both") {
+    lr <- lr |>
+      dplyr::mutate(lr_st_ref = lr - lr[file_name == ref],
+                    lr_mean = mean(lr, na.rm = TRUE),
+                    .by = probeset_id) |>
+      dplyr::mutate(lr_st_mean = lr - lr_mean,
+                    .before = lr_st_ref) |>
+      dplyr::select(-lr_mean)
+  }
+
+  # Assign to Global Environment
+  assign(x = "lr", value = lr, pos = ".GlobalEnv")
+
+}
diff --git a/man/standardize_lr.Rd b/man/standardize_lr.Rd
new file mode 100644
index 0000000..8792558
--- /dev/null
+++ b/man/standardize_lr.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/standardize_lr.R
+\name{standardize_lr}
+\alias{standardize_lr}
+\title{Standardize lr values}
+\usage{
+standardize_lr(lr, method = "mean", ref = NULL)
+}
+\arguments{
+\item{lr}{lr values}
+
+\item{method}{how to perform standardization ("ref", "mean", or "both", defaults to "mean")}
+
+\item{ref}{name of the sample to use for standardization (must be defined if method is "ref" or "both", defaults to NULL)}
+}
+\value{
+a \code{\link[=tibble]{tibble()}}
+}
+\description{
+Standardize lr values
+}
+\examples{
+\dontrun{
+standardize_lr(lr, method = "mean", ref = NULL)
+}
+}
-- 
GitLab


From 8e1bdf67eda8f8f08433ac8e21905840e335d3e2 Mon Sep 17 00:00:00 2001
From: Jonathan Kitt <jonathan.kitt@inrae.fr>
Date: Mon, 27 Jan 2025 16:24:41 +0100
Subject: [PATCH 4/4] fix bug in standardise_lr()

---
 R/standardise_lr.R | 2 +-
 R/standardize_lr.R | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/standardise_lr.R b/R/standardise_lr.R
index 31c27a5..f7b3495 100644
--- a/R/standardise_lr.R
+++ b/R/standardise_lr.R
@@ -24,7 +24,7 @@ standardise_lr <- function(lr, method = "mean", ref = NULL) {
     if(is.null(ref)){
       stop("Reference sample is not defined")
     }
-    if(!is.null(ref) & !ref %in% names(calls)) {
+    if(!is.null(ref) & !ref %in% unique(lr$file_name)) {
       stop("Reference sample doesn't exist in the list of samples")
     }
   }
diff --git a/R/standardize_lr.R b/R/standardize_lr.R
index a4477d7..fa780b3 100644
--- a/R/standardize_lr.R
+++ b/R/standardize_lr.R
@@ -24,7 +24,7 @@ standardize_lr <- function(lr, method = "mean", ref = NULL) {
     if(is.null(ref)){
       stop("Reference sample is not defined")
     }
-    if(!is.null(ref) & !ref %in% names(calls)) {
+    if(!is.null(ref) & !ref %in% unique(lr$file_name)) {
       stop("Reference sample doesn't exist in the list of samples")
     }
   }
-- 
GitLab