From c3e216fb2b368faebccfc255719bdc4271576809 Mon Sep 17 00:00:00 2001 From: Jonathan Kitt <jonathan.kitt@inrae.fr> Date: Mon, 27 Jan 2025 13:33:01 +0100 Subject: [PATCH 1/4] move functions to misc/ --- NAMESPACE | 14 -- R/merge_segments.R | 95 ------------ man/calculate_maf.Rd | 22 --- man/count_alleles.Rd | 22 --- man/count_confidences.Rd | 24 --- man/extract_ranges.Rd | 22 --- man/find_cpts.Rd | 46 ------ man/merge_segments.Rd | 24 --- man/normalise_lr.Rd | 24 --- man/regroup_data.Rd | 24 --- man/summarise_segments.Rd | 26 ---- man/transform_lr.Rd | 26 ---- {R => misc}/calculate_maf.R | 0 {R => misc}/count_alleles.R | 0 {R => misc}/count_confidences.R | 0 {R => misc}/extract_ranges.R | 0 {R => misc}/find_cpts.R | 0 misc/merge_segments.R | 257 +++++++++---------------------- {R => misc}/normalise_lr.R | 0 {R => misc}/regroup_data.R | 0 {R => misc}/summarise_segments.R | 0 {R => misc}/transform_lr.R | 0 22 files changed, 77 insertions(+), 549 deletions(-) delete mode 100644 R/merge_segments.R delete mode 100644 man/calculate_maf.Rd delete mode 100644 man/count_alleles.Rd delete mode 100644 man/count_confidences.Rd delete mode 100644 man/extract_ranges.Rd delete mode 100644 man/find_cpts.Rd delete mode 100644 man/merge_segments.Rd delete mode 100644 man/normalise_lr.Rd delete mode 100644 man/regroup_data.Rd delete mode 100644 man/summarise_segments.Rd delete mode 100644 man/transform_lr.Rd rename {R => misc}/calculate_maf.R (100%) rename {R => misc}/count_alleles.R (100%) rename {R => misc}/count_confidences.R (100%) rename {R => misc}/extract_ranges.R (100%) rename {R => misc}/find_cpts.R (100%) rename {R => misc}/normalise_lr.R (100%) rename {R => misc}/regroup_data.R (100%) rename {R => misc}/summarise_segments.R (100%) rename {R => misc}/transform_lr.R (100%) diff --git a/NAMESPACE b/NAMESPACE index 23a2879..519854c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,22 +1,8 @@ # Generated by roxygen2: do not edit by hand export(calculate_lr) -export(calculate_maf) -export(count_alleles) -export(count_confidences) -export(extract_ranges) -export(find_cpts) -export(merge_segments) -export(normalise_lr) export(read_axiom) -export(regroup_data) -export(summarise_segments) -export(transform_lr) -import(GenomicRanges) -import(changepoint) import(dplyr) -import(plyranges) -import(purrr) import(readr) import(stringr) import(tidyr) diff --git a/R/merge_segments.R b/R/merge_segments.R deleted file mode 100644 index 6ccd03f..0000000 --- a/R/merge_segments.R +++ /dev/null @@ -1,95 +0,0 @@ -#' Merge consecutive segments and recalculate parameters -#' -#' @param segments_file segments file -#' @param regrouped_data_file lrn & calls file -#' -#' @import dplyr -#' @import GenomicRanges -#' -#' @return A [tibble()]. -#' @export -#' -#' @examples -#' \dontrun{ -#' merge_segments(segments_file, regrouped_data_file) -#' } - -merge_segments <- function(segments_file, regrouped_data_file) { - - # Set NULL variables - - chromosome <- file_name <- group_name <- high_lrn_count <- mseg_end <- - mseg_start <- position <- seg_nb <- value <- NULL - - # Merge consecutive segments - merged_segments <- segments_file %>% - dplyr::select(file_name, chromosome, start = seg_nb, end = seg_nb) %>% - GenomicRanges::makeGRangesListFromDataFrame(split.field = "file_name") %>% - GenomicRanges::reduce() %>% - tibble::as_tibble() %>% - dplyr::select(file_name = group_name, - chromosome = seqnames, - mseg_start = start, - mseg_end = end) - - # Keep non-merged segments - same_segments <- merged_segments %>% - dplyr::filter(mseg_start == mseg_end) %>% - dplyr::select(file_name, chromosome, seg_nb = mseg_start) %>% - dplyr::left_join(segments_file) %>% - dplyr::select(file_name, chromosome, start:high_lrn_count) - - if (nrow(same_segments) != nrow(merged_segments)) { - - # Extract merged segments and recalculate parameters - diff_segments <- merged_segments %>% - dplyr::filter(mseg_start != mseg_end) %>% - tidyr::pivot_longer(!c(file_name, chromosome)) %>% - dplyr::select(file_name, chromosome, seg_nb = value) %>% - dplyr::left_join(segments_file) %>% - dplyr::select(file_name, chromosome, start:end) %>% - dplyr::group_by(file_name, chromosome) %>% - dplyr::mutate(start = min(start), - end = max(end)) %>% - dplyr::slice(1) - - lrn_means <- list() - snp_counts <- list() - otv_counts <- list() - low_lrn_counts <- list() - high_lrn_counts <- list() - - for (i in 1:nrow(diff_segments)) { - d1 <- regrouped_data_file %>% - dplyr::filter(file_name == diff_segments$file_name[i] & - chromosome == diff_segments$chromosome[i] & - position >= diff_segments$start[i] & - position <= diff_segments$end[i]) - - lrn_means[[i]] <- mean(d1$lrn) - snp_counts[[i]] <- nrow(d1) - otv_counts[[i]] <- length(which(d1$call == "-2")) - low_lrn_counts[[i]] <- length(which(d1$lrn <= -1)) - high_lrn_counts[[i]] <- length(which(d1$lrn >= 1)) - } - - diff_segments$lrn_mean <- unlist(lrn_means) - diff_segments$snp_count <- unlist(snp_counts) - diff_segments$otv_count <- unlist(otv_counts) - diff_segments$low_lrn_count <- unlist(low_lrn_counts) - diff_segments$high_lrn_count <- unlist(high_lrn_counts) - - # rbind same_segments & diff_segments + reorder - final_mseg <- rbind(same_segments, diff_segments) - - } - - if (nrow(same_segments) == nrow(merged_segments)) { - - final_mseg <- same_segments - - } - - return(final_mseg) - -} diff --git a/man/calculate_maf.Rd b/man/calculate_maf.Rd deleted file mode 100644 index 0b7ba2c..0000000 --- a/man/calculate_maf.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/calculate_maf.R -\name{calculate_maf} -\alias{calculate_maf} -\title{Calculate MAF from allele counts} -\usage{ -calculate_maf(allele_count) -} -\arguments{ -\item{allele_count}{allele count file} -} -\value{ -A \code{\link[=tibble]{tibble()}}. -} -\description{ -Calculate MAF from allele counts -} -\examples{ -\dontrun{ -calculate_maf(allele_count) -} -} diff --git a/man/count_alleles.Rd b/man/count_alleles.Rd deleted file mode 100644 index 647e301..0000000 --- a/man/count_alleles.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/count_alleles.R -\name{count_alleles} -\alias{count_alleles} -\title{Count alleles for each SNP in AximGT1.calls.txt data} -\usage{ -count_alleles(calls_file) -} -\arguments{ -\item{calls_file}{calls file} -} -\value{ -A \code{\link[=tibble]{tibble()}}. -} -\description{ -Count alleles for each SNP in AximGT1.calls.txt data -} -\examples{ -\dontrun{ -count_alleles(calls_file) -} -} diff --git a/man/count_confidences.Rd b/man/count_confidences.Rd deleted file mode 100644 index 0a1210a..0000000 --- a/man/count_confidences.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/count_confidences.R -\name{count_confidences} -\alias{count_confidences} -\title{Count number of values above confidence score threshold for each SNP in AxiomGT1.confidences.txt data} -\usage{ -count_confidences(confidences_file, threshold = 0.15) -} -\arguments{ -\item{confidences_file}{confidences file} - -\item{threshold}{confidence score threshold (defaults to 0.15)} -} -\value{ -A \code{\link[=tibble]{tibble()}}. -} -\description{ -Count number of values above confidence score threshold for each SNP in AxiomGT1.confidences.txt data -} -\examples{ -\dontrun{ -count_confidences(confidences_file, threshold) -} -} diff --git a/man/extract_ranges.Rd b/man/extract_ranges.Rd deleted file mode 100644 index 555151e..0000000 --- a/man/extract_ranges.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/extract_ranges.R -\name{extract_ranges} -\alias{extract_ranges} -\title{Extract ranges} -\usage{ -extract_ranges(snps) -} -\arguments{ -\item{snps}{list of SNPs} -} -\value{ -a table with minimum and maximum SNP positions for each chromosome -} -\description{ -Extract ranges -} -\examples{ -\dontrun{ -extract_ranges(snps) -} -} diff --git a/man/find_cpts.Rd b/man/find_cpts.Rd deleted file mode 100644 index a518517..0000000 --- a/man/find_cpts.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/find_cpts.R -\name{find_cpts} -\alias{find_cpts} -\title{Find changepoints in lrn data} -\usage{ -find_cpts( - lr_st, - calc = "mean", - var = "lr_st_ref", - method = "BinSeg", - penalty = "MBIC", - pen_value = 0, - max_cpts = 3, - count_otvs = TRUE -) -} -\arguments{ -\item{lr_st}{standardized lr values} - -\item{calc}{changepoints will be calculated w.r.t. this value ("mean", "var" or "mean_var")} - -\item{var}{variable to be used for changepoint detection} - -\item{method}{changepoint detection method (defaults to "BinSeg")} - -\item{penalty}{changepoint detection penalty (defaults to "MBIC")} - -\item{pen_value}{penalty value (defaults to 0)} - -\item{max_cpts}{maximum number of changepoints (defaults to 3)} - -\item{count_otvs}{whether or not to add number of OTVs in detected segments (defaults to TRUE)} -} -\value{ -a \code{\link[=tibble]{tibble()}} -} -\description{ -Find changepoints in lrn data -} -\examples{ -\dontrun{ -find_cpts(lr_st, calc = "mean", var = "lr_st_ref", method = "BinSeg", -penalty = "MBIC", pen_value = 0, max_cpts = 3, count_otvs = TRUE) -} -} diff --git a/man/merge_segments.Rd b/man/merge_segments.Rd deleted file mode 100644 index f4276c8..0000000 --- a/man/merge_segments.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/merge_segments.R -\name{merge_segments} -\alias{merge_segments} -\title{Merge consecutive segments and recalculate parameters} -\usage{ -merge_segments(segments_file, regrouped_data_file) -} -\arguments{ -\item{segments_file}{segments file} - -\item{regrouped_data_file}{lrn & calls file} -} -\value{ -A \code{\link[=tibble]{tibble()}}. -} -\description{ -Merge consecutive segments and recalculate parameters -} -\examples{ -\dontrun{ -merge_segments(segments_file, regrouped_data_file) -} -} diff --git a/man/normalise_lr.Rd b/man/normalise_lr.Rd deleted file mode 100644 index 8c07644..0000000 --- a/man/normalise_lr.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/normalise_lr.R -\name{normalise_lr} -\alias{normalise_lr} -\title{Normalise lr values} -\usage{ -normalise_lr(lr, norm) -} -\arguments{ -\item{lr}{lr values} - -\item{norm}{sample used for normalisation} -} -\value{ -a \code{\link[=tibble]{tibble()}} -} -\description{ -Normalise lr values -} -\examples{ -\dontrun{ -normalise_lr(lr, norm) -} -} diff --git a/man/regroup_data.Rd b/man/regroup_data.Rd deleted file mode 100644 index 48b1de2..0000000 --- a/man/regroup_data.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/regroup_data.R -\name{regroup_data} -\alias{regroup_data} -\title{Regroup lrn and calls data} -\usage{ -regroup_data(lrn_file, calls_file) -} -\arguments{ -\item{lrn_file}{lrn file} - -\item{calls_file}{calls file} -} -\value{ -A \code{\link[=tibble]{tibble()}}. -} -\description{ -Regroup lrn and calls data -} -\examples{ -\dontrun{ -regroup_data(lrn_file, calls_file) -} -} diff --git a/man/summarise_segments.Rd b/man/summarise_segments.Rd deleted file mode 100644 index 7644dd7..0000000 --- a/man/summarise_segments.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarise_segments.R -\name{summarise_segments} -\alias{summarise_segments} -\title{Summarise segments : count OTVs} -\usage{ -summarise_segments(raw_segments_file, regrouped_data_file, snps_file) -} -\arguments{ -\item{raw_segments_file}{raw segments file} - -\item{regrouped_data_file}{file containing lrn values and calls} - -\item{snps_file}{list of SNPs} -} -\value{ -A \code{\link[=tibble]{tibble()}}. -} -\description{ -Summarise segments : count OTVs -} -\examples{ -\dontrun{ -summarise_segments(raw_segments_file, regrouped_data_file, snps_file) -} -} diff --git a/man/transform_lr.Rd b/man/transform_lr.Rd deleted file mode 100644 index 384864a..0000000 --- a/man/transform_lr.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/transform_lr.R -\name{transform_lr} -\alias{transform_lr} -\title{Transform lr values} -\usage{ -transform_lr(lr, method = "ref", ref) -} -\arguments{ -\item{lr}{lr values} - -\item{method}{how to perform standardization ("ref", "mean", or "both")} - -\item{ref}{sample used for standardization} -} -\value{ -a \code{\link[=tibble]{tibble()}} -} -\description{ -Transform lr values -} -\examples{ -\dontrun{ -transform_lr(lr, method = "ref", ref) -} -} diff --git a/R/calculate_maf.R b/misc/calculate_maf.R similarity index 100% rename from R/calculate_maf.R rename to misc/calculate_maf.R diff --git a/R/count_alleles.R b/misc/count_alleles.R similarity index 100% rename from R/count_alleles.R rename to misc/count_alleles.R diff --git a/R/count_confidences.R b/misc/count_confidences.R similarity index 100% rename from R/count_confidences.R rename to misc/count_confidences.R diff --git a/R/extract_ranges.R b/misc/extract_ranges.R similarity index 100% rename from R/extract_ranges.R rename to misc/extract_ranges.R diff --git a/R/find_cpts.R b/misc/find_cpts.R similarity index 100% rename from R/find_cpts.R rename to misc/find_cpts.R diff --git a/misc/merge_segments.R b/misc/merge_segments.R index 249ce8a..6ccd03f 100644 --- a/misc/merge_segments.R +++ b/misc/merge_segments.R @@ -1,198 +1,95 @@ -#' Filter and merge segments obtained from changepoint detection +#' Merge consecutive segments and recalculate parameters #' -#' @description -#' This function filters and merges segments detected using the \code{detect_cpts()} function. +#' @param segments_file segments file +#' @param regrouped_data_file lrn & calls file #' -#' @import data.table -#' @importFrom plyr join -#' @importFrom GenomicRanges makeGRangesListFromDataFrame reduce -#' -#' @param array name of the array to be analyzed. -#' @param raw_seg_file path to the \code{raw_segments} file. -#' @param mean_cutoff threshold lrn_mean value (absolute value) used to determine whether a segment should -#' be kept (defaults to 0.25). -#' @param size_cutoff threshold segment size value used to determine whether a segment should be kept (defaults to 500kb). -#' @param output_path where the results will be saved -#' -#' @details -#' The merging is done in two consecutive steps : -#' - merge consecutive segments with a lrn_mean value regardless of gaps -#' - merge segments separated by another segment with a size inferior to size_cutoff -#' -#' @usage merge_segments(array, raw_seg_file, mean_cutoff = 0.25, -#' size_cutoff = 500e3, output_path) -#' -#' @return A \code{merged_segments} data.table containing the following columns : -#' Returns a table containing the following columns : -#' - species -#' - array_type -#' - array_name -#' - unique_id -#' - sample_name -#' - chromosome -#' - start -#' - end -#' - sv_type +#' @import dplyr +#' @import GenomicRanges #' +#' @return A [tibble()]. #' @export #' #' @examples #' \dontrun{ -#' merge_segments(array, raw_seg_file, mean_cutoff, size_cutoff, output_path) +#' merge_segments(segments_file, regrouped_data_file) #' } -merge_segments <- function(array, raw_seg_file, mean_cutoff = 0.25, - size_cutoff = 500e3, output_path) { - - lrn_mean <- end <- start <- nb_cpts <- seg_nb <- - . <- array_name <- analysis_group <- unique_id <- - chromosome <- head <- size <- width <- lrn_cutoff <- - raw_seg_file <- NULL - - # Check arguments ---- - - if (missing(array)) stop("array is required") - if (missing(raw_seg_file)) stop("raw_seg_file is required") - if (missing(output_path)) stop("output_path is required") - - # Create folders for saving results ---- - - if (!dir.exists(paste0(output_path, "results/"))) { - dir.create(paste0(output_path, "results/")) - } - - if (!dir.exists(paste0(output_path, "results/merged_segments"))) { - dir.create(paste0(output_path, "results/merged_segments/")) - } - - # 1. Import data and prepare files ---- - - # 1.1. import segmentation file - raw_seg <- data.table::fread(raw_seg_file) - rm(raw_seg_file) - - # 1.3. extract list of samples - smp <- raw_seg[, 1:5] - smp <- smp[!duplicated(smp), ] - - # 1.4. split segments into seg_neg & seg_pos + filter - seg_neg <- raw_seg[lrn_mean <= -mean_cutoff][, c(4, 7, 9:11)] - seg_pos <- raw_seg[lrn_mean >= mean_cutoff][, c(4, 7, 9:11)] - rm(raw_seg) - - # 2. Merge consecutive segments ---- - - # 2.1. create GRangesLists - - seg_nb_neg <- seg_neg[, c(1:3, 3)] - seg_nb_pos <- seg_pos[, c(1:3, 3)] - - names(seg_nb_neg) <- c("unique_id", "chromosome", "start", "end") - names(seg_nb_pos) <- c("unique_id", "chromosome", "start", "end") +merge_segments <- function(segments_file, regrouped_data_file) { + + # Set NULL variables + + chromosome <- file_name <- group_name <- high_lrn_count <- mseg_end <- + mseg_start <- position <- seg_nb <- value <- NULL + + # Merge consecutive segments + merged_segments <- segments_file %>% + dplyr::select(file_name, chromosome, start = seg_nb, end = seg_nb) %>% + GenomicRanges::makeGRangesListFromDataFrame(split.field = "file_name") %>% + GenomicRanges::reduce() %>% + tibble::as_tibble() %>% + dplyr::select(file_name = group_name, + chromosome = seqnames, + mseg_start = start, + mseg_end = end) + + # Keep non-merged segments + same_segments <- merged_segments %>% + dplyr::filter(mseg_start == mseg_end) %>% + dplyr::select(file_name, chromosome, seg_nb = mseg_start) %>% + dplyr::left_join(segments_file) %>% + dplyr::select(file_name, chromosome, start:high_lrn_count) + + if (nrow(same_segments) != nrow(merged_segments)) { + + # Extract merged segments and recalculate parameters + diff_segments <- merged_segments %>% + dplyr::filter(mseg_start != mseg_end) %>% + tidyr::pivot_longer(!c(file_name, chromosome)) %>% + dplyr::select(file_name, chromosome, seg_nb = value) %>% + dplyr::left_join(segments_file) %>% + dplyr::select(file_name, chromosome, start:end) %>% + dplyr::group_by(file_name, chromosome) %>% + dplyr::mutate(start = min(start), + end = max(end)) %>% + dplyr::slice(1) + + lrn_means <- list() + snp_counts <- list() + otv_counts <- list() + low_lrn_counts <- list() + high_lrn_counts <- list() + + for (i in 1:nrow(diff_segments)) { + d1 <- regrouped_data_file %>% + dplyr::filter(file_name == diff_segments$file_name[i] & + chromosome == diff_segments$chromosome[i] & + position >= diff_segments$start[i] & + position <= diff_segments$end[i]) + + lrn_means[[i]] <- mean(d1$lrn) + snp_counts[[i]] <- nrow(d1) + otv_counts[[i]] <- length(which(d1$call == "-2")) + low_lrn_counts[[i]] <- length(which(d1$lrn <= -1)) + high_lrn_counts[[i]] <- length(which(d1$lrn >= 1)) + } + + diff_segments$lrn_mean <- unlist(lrn_means) + diff_segments$snp_count <- unlist(snp_counts) + diff_segments$otv_count <- unlist(otv_counts) + diff_segments$low_lrn_count <- unlist(low_lrn_counts) + diff_segments$high_lrn_count <- unlist(high_lrn_counts) + + # rbind same_segments & diff_segments + reorder + final_mseg <- rbind(same_segments, diff_segments) - seg_nb_neg <- GenomicRanges::makeGRangesListFromDataFrame(df = seg_nb_neg, split.field = "unique_id") - seg_nb_pos <- GenomicRanges::makeGRangesListFromDataFrame(df = seg_nb_pos,split.field = "unique_id") - - # 2.2. reduce GRangesLists - - mseg_nb_neg <- GenomicRanges::reduce(seg_nb_neg) - mseg_nb_pos <- GenomicRanges::reduce(seg_nb_pos) - - # 2.3. create tables - - mseg_nb_neg_list <- list() - for (i in 1:length(mseg_nb_neg)) { - mseg_nb_neg_list[[i]] <- data.table::as.data.table(mseg_nb_neg[[i]]) - mseg_nb_neg_list[[i]]$unique_id <- names(mseg_nb_neg[i]) - } - mseg_nb_neg_tb <- data.table::rbindlist(mseg_nb_neg_list)[, c(1:3, 6)] - - mseg_nb_pos_list <- list() - for (i in 1:length(mseg_nb_pos)) { - mseg_nb_pos_list[[i]] <- data.table::as.data.table(mseg_nb_pos[[i]]) - mseg_nb_pos_list[[i]]$unique_id <- names(mseg_nb_pos[i]) - } - mseg_nb_pos_tb <- data.table::rbindlist(mseg_nb_pos_list)[, c(1:3, 6)] - - # 2.4. extract physical positions - - mseg_nb_neg <- list() - for (i in 1:nrow(mseg_nb_neg_tb)) { - d1 <- seg_neg[unique_id == mseg_nb_neg_tb$unique_id[i] & chromosome == mseg_nb_neg_tb$seqnames[i]] - mseg_nb_neg[[i]] <- data.table::data.table( - chromosome = unique(d1$chromosome), - unique_id = unique(d1$unique_id), - start = d1$start[d1$seg_nb == mseg_nb_neg_tb$start[i]], - end = d1$end[d1$seg_nb == mseg_nb_neg_tb$end[i]] - ) - } - mseg_nb_neg_dt <- data.table::rbindlist(mseg_nb_neg) - - mseg_nb_pos <- list() - for (i in 1:nrow(mseg_nb_pos_tb)) { - d1 <- seg_pos[unique_id == mseg_nb_pos_tb$unique_id[i] & chromosome == mseg_nb_pos_tb$seqnames[i]] - mseg_nb_pos[[i]] <- data.table::data.table( - chromosome = unique(d1$chromosome), - unique_id = unique(d1$unique_id), - start = d1$start[d1$seg_nb == mseg_nb_pos_tb$start[i]], - end = d1$end[d1$seg_nb == mseg_nb_pos_tb$end[i]] - ) } - mseg_nb_pos_dt <- data.table::rbindlist(mseg_nb_pos) - - rm(d1, mseg_nb_neg, mseg_nb_neg_tb, mseg_nb_pos, mseg_nb_pos_tb, - seg_nb_neg, seg_nb_pos, seg_neg, seg_pos, i, mean_cutoff, mseg_nb_neg_list, mseg_nb_pos_list) - - # 3. Merge segments according to gaps ---- - # 3.1. create GRangesLists + if (nrow(same_segments) == nrow(merged_segments)) { - seg_neg <- GenomicRanges::makeGRangesListFromDataFrame(df = mseg_nb_neg_dt, split.field = "unique_id") - seg_pos <- GenomicRanges::makeGRangesListFromDataFrame(df = mseg_nb_pos_dt, split.field = "unique_id") + final_mseg <- same_segments - # 3.2. reduce GRangesLists - - mseg_neg <- GenomicRanges::reduce(seg_neg, min.gapwidth = size_cutoff) - mseg_pos <- GenomicRanges::reduce(seg_pos, min.gapwidth = size_cutoff) - - merged_segments_neg <- list() - for (i in 1:length(mseg_neg)) { - merged_segments_neg[[i]] <- data.table::as.data.table(mseg_neg[[i]]) - merged_segments_neg[[i]]$unique_id <- names(mseg_neg[i]) - merged_segments_neg[[i]]$sv_type <- "negative" - } - merged_segments_neg_table <- data.table::rbindlist(merged_segments_neg) - merged_segments_neg_table <- merged_segments_neg_table[width >= size_cutoff][, c(1:3, 6:7)] - - merged_segments_pos <- list() - for (i in 1:length(mseg_pos)) { - merged_segments_pos[[i]] <- data.table::as.data.table(mseg_pos[[i]]) - merged_segments_pos[[i]]$unique_id <- names(mseg_pos[i]) - merged_segments_pos[[i]]$sv_type <- "positive" } - merged_segments_pos_table <- data.table::rbindlist(merged_segments_pos) - merged_segments_pos_table <- merged_segments_pos_table[width >= size_cutoff][, c(1:3, 6:7)] - - # 3.3. join negative and positive tables - - merged_segments <- rbind(merged_segments_neg_table, merged_segments_pos_table) - names(merged_segments) <- c("chromosome", names(merged_segments)[-1]) - - rm(merged_segments_neg, merged_segments_neg_table, merged_segments_pos, merged_segments_pos_table, - mseg_nb_neg_dt, mseg_nb_pos_dt, mseg_neg, mseg_pos, seg_neg, seg_pos, i, size_cutoff) - - # 3.4. join merged segments with samples - mseg_final <- plyr::join(merged_segments, smp) - mseg_final <- mseg_final[, c(6:8, 4, 9, 1:3, 5)] - - rm(merged_segments, smp) - - # 4. Save output ---- - - data.table::fwrite(mseg_final, - paste0(output_path, "results/merged_segments/", array, "_merged_segments.txt"), - quote = FALSE, row.names = FALSE, sep = "\t") - rm(mseg_final, array, raw_seg_file, output_path) + return(final_mseg) } diff --git a/R/normalise_lr.R b/misc/normalise_lr.R similarity index 100% rename from R/normalise_lr.R rename to misc/normalise_lr.R diff --git a/R/regroup_data.R b/misc/regroup_data.R similarity index 100% rename from R/regroup_data.R rename to misc/regroup_data.R diff --git a/R/summarise_segments.R b/misc/summarise_segments.R similarity index 100% rename from R/summarise_segments.R rename to misc/summarise_segments.R diff --git a/R/transform_lr.R b/misc/transform_lr.R similarity index 100% rename from R/transform_lr.R rename to misc/transform_lr.R -- GitLab From 618afc246512717ce81adecdad98463490703789 Mon Sep 17 00:00:00 2001 From: Jonathan Kitt <jonathan.kitt@inrae.fr> Date: Mon, 27 Jan 2025 15:42:28 +0100 Subject: [PATCH 2/4] replace normalise_lr() and transform_lr() by standardise_lr() --- NAMESPACE | 1 + R/calculate_lr.R | 14 ++-------- R/standardise_lr.R | 62 +++++++++++++++++++++++++++++++++++++++++++ man/calculate_lr.Rd | 6 ++--- man/standardise_lr.Rd | 26 ++++++++++++++++++ 5 files changed, 93 insertions(+), 16 deletions(-) create mode 100644 R/standardise_lr.R create mode 100644 man/standardise_lr.Rd diff --git a/NAMESPACE b/NAMESPACE index 519854c..18e70c1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(calculate_lr) export(read_axiom) +export(standardise_lr) import(dplyr) import(readr) import(stringr) diff --git a/R/calculate_lr.R b/R/calculate_lr.R index 076e8ef..43d1da1 100644 --- a/R/calculate_lr.R +++ b/R/calculate_lr.R @@ -1,7 +1,6 @@ #' Calculate lr values from summary file #' #' @param summary_file imported summary file (usually named 'summary') -#' @param format "wide" or "long" table (defaults to "long") #' #' @import dplyr #' @import tidyr @@ -11,10 +10,10 @@ #' #' @examples #' \dontrun{ -#' calculate_lr(summary_file, format = "long") +#' calculate_lr(summary_file) #' } -calculate_lr <- function(summary_file, format = "long") { +calculate_lr <- function(summary_file) { # Set NULL variables @@ -47,15 +46,6 @@ calculate_lr <- function(summary_file, format = "long") { dplyr::mutate(lr = log2(sqrt(signal_a^2 + signal_b^2)), .keep = "unused") - # If format is set to "wide", transform the table - if (format == "wide") { - message("Pivoting output table to wide format") - lr <- lr |> - tidyr::pivot_wider(id_cols = probeset_id, - names_from = file_name, - values_from = lr) - } - # Assign to Global Environment assign(x = "lr", value = lr, pos = ".GlobalEnv") diff --git a/R/standardise_lr.R b/R/standardise_lr.R new file mode 100644 index 0000000..31c27a5 --- /dev/null +++ b/R/standardise_lr.R @@ -0,0 +1,62 @@ +#' Standardise lr values +#' +#' @param lr lr values +#' @param method how to perform standardisation ("ref", "mean", or "both", defaults to "mean") +#' @param ref name of the sample to use for standardisation (must be defined if method is "ref" or "both", defaults to NULL) +#' +#' @return a [tibble()] +#' @export +#' +#' @examples +#' \dontrun{ +#' standardise_lr(lr, method = "mean", ref = NULL) +#' } + +standardise_lr <- function(lr, method = "mean", ref = NULL) { + + # Set NULL variables + + file_name <- probeset_id <- NULL + + # If method = "ref" or "both", check if ref is defined + if(method %in% c("ref", "both")){ + print("Checking if reference sample is defined") + if(is.null(ref)){ + stop("Reference sample is not defined") + } + if(!is.null(ref) & !ref %in% names(calls)) { + stop("Reference sample doesn't exist in the list of samples") + } + } + + # If method = "mean", calculate lr_st_mean + if (method == "mean") { + lr <- lr |> + dplyr::mutate(lr_mean = mean(lr, na.rm = TRUE), + .by = probeset_id) |> + dplyr::mutate(lr_st_mean = lr - lr_mean) |> + dplyr::select(-lr_mean) + } + + # If method = "ref", calculate lr_st_ref + if (method == "ref") { + lr <- lr |> + dplyr::mutate(lr_st_ref = lr - lr[file_name == ref], + .by = probeset_id) + } + + # If method = "both", calculate lr_st_ref & lr_st_mean + if (method == "both") { + lr <- lr |> + dplyr::mutate(lr_st_ref = lr - lr[file_name == ref], + lr_mean = mean(lr, na.rm = TRUE), + .by = probeset_id) |> + dplyr::mutate(lr_st_mean = lr - lr_mean, + .before = lr_st_ref) |> + dplyr::select(-lr_mean) + } + + # Assign to Global Environment + assign(x = "lr", value = lr, pos = ".GlobalEnv") + +} diff --git a/man/calculate_lr.Rd b/man/calculate_lr.Rd index 38748af..d9a2191 100644 --- a/man/calculate_lr.Rd +++ b/man/calculate_lr.Rd @@ -4,12 +4,10 @@ \alias{calculate_lr} \title{Calculate lr values from summary file} \usage{ -calculate_lr(summary_file, format = "long") +calculate_lr(summary_file) } \arguments{ \item{summary_file}{imported summary file (usually named 'summary')} - -\item{format}{"wide" or "long" table (defaults to "long")} } \value{ A \code{\link[=tibble]{tibble()}}. @@ -19,6 +17,6 @@ Calculate lr values from summary file } \examples{ \dontrun{ -calculate_lr(summary_file, format = "long") +calculate_lr(summary_file) } } diff --git a/man/standardise_lr.Rd b/man/standardise_lr.Rd new file mode 100644 index 0000000..48f2038 --- /dev/null +++ b/man/standardise_lr.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/standardise_lr.R +\name{standardise_lr} +\alias{standardise_lr} +\title{Standardise lr values} +\usage{ +standardise_lr(lr, method = "mean", ref = NULL) +} +\arguments{ +\item{lr}{lr values} + +\item{method}{how to perform standardisation ("ref", "mean", or "both", defaults to "mean")} + +\item{ref}{name of the sample to use for standardisation (must be defined if method is "ref" or "both", defaults to NULL)} +} +\value{ +a \code{\link[=tibble]{tibble()}} +} +\description{ +Standardise lr values +} +\examples{ +\dontrun{ +standardise_lr(lr, method = "mean", ref = NULL) +} +} -- GitLab From 469889dd0a8fcbde11ae38dc27e2bee426c16cb0 Mon Sep 17 00:00:00 2001 From: Jonathan Kitt <jonathan.kitt@inrae.fr> Date: Mon, 27 Jan 2025 15:45:30 +0100 Subject: [PATCH 3/4] add standardize_lr() function --- NAMESPACE | 1 + R/standardize_lr.R | 62 +++++++++++++++++++++++++++++++++++++++++++ man/standardize_lr.Rd | 26 ++++++++++++++++++ 3 files changed, 89 insertions(+) create mode 100644 R/standardize_lr.R create mode 100644 man/standardize_lr.Rd diff --git a/NAMESPACE b/NAMESPACE index 18e70c1..1ae3984 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ export(calculate_lr) export(read_axiom) export(standardise_lr) +export(standardize_lr) import(dplyr) import(readr) import(stringr) diff --git a/R/standardize_lr.R b/R/standardize_lr.R new file mode 100644 index 0000000..a4477d7 --- /dev/null +++ b/R/standardize_lr.R @@ -0,0 +1,62 @@ +#' Standardize lr values +#' +#' @param lr lr values +#' @param method how to perform standardization ("ref", "mean", or "both", defaults to "mean") +#' @param ref name of the sample to use for standardization (must be defined if method is "ref" or "both", defaults to NULL) +#' +#' @return a [tibble()] +#' @export +#' +#' @examples +#' \dontrun{ +#' standardize_lr(lr, method = "mean", ref = NULL) +#' } + +standardize_lr <- function(lr, method = "mean", ref = NULL) { + + # Set NULL variables + + file_name <- probeset_id <- NULL + + # If method = "ref" or "both", check if ref is defined + if(method %in% c("ref", "both")){ + print("Checking if reference sample is defined") + if(is.null(ref)){ + stop("Reference sample is not defined") + } + if(!is.null(ref) & !ref %in% names(calls)) { + stop("Reference sample doesn't exist in the list of samples") + } + } + + # If method = "mean", calculate lr_st_mean + if (method == "mean") { + lr <- lr |> + dplyr::mutate(lr_mean = mean(lr, na.rm = TRUE), + .by = probeset_id) |> + dplyr::mutate(lr_st_mean = lr - lr_mean) |> + dplyr::select(-lr_mean) + } + + # If method = "ref", calculate lr_st_ref + if (method == "ref") { + lr <- lr |> + dplyr::mutate(lr_st_ref = lr - lr[file_name == ref], + .by = probeset_id) + } + + # If method = "both", calculate lr_st_ref & lr_st_mean + if (method == "both") { + lr <- lr |> + dplyr::mutate(lr_st_ref = lr - lr[file_name == ref], + lr_mean = mean(lr, na.rm = TRUE), + .by = probeset_id) |> + dplyr::mutate(lr_st_mean = lr - lr_mean, + .before = lr_st_ref) |> + dplyr::select(-lr_mean) + } + + # Assign to Global Environment + assign(x = "lr", value = lr, pos = ".GlobalEnv") + +} diff --git a/man/standardize_lr.Rd b/man/standardize_lr.Rd new file mode 100644 index 0000000..8792558 --- /dev/null +++ b/man/standardize_lr.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/standardize_lr.R +\name{standardize_lr} +\alias{standardize_lr} +\title{Standardize lr values} +\usage{ +standardize_lr(lr, method = "mean", ref = NULL) +} +\arguments{ +\item{lr}{lr values} + +\item{method}{how to perform standardization ("ref", "mean", or "both", defaults to "mean")} + +\item{ref}{name of the sample to use for standardization (must be defined if method is "ref" or "both", defaults to NULL)} +} +\value{ +a \code{\link[=tibble]{tibble()}} +} +\description{ +Standardize lr values +} +\examples{ +\dontrun{ +standardize_lr(lr, method = "mean", ref = NULL) +} +} -- GitLab From 8e1bdf67eda8f8f08433ac8e21905840e335d3e2 Mon Sep 17 00:00:00 2001 From: Jonathan Kitt <jonathan.kitt@inrae.fr> Date: Mon, 27 Jan 2025 16:24:41 +0100 Subject: [PATCH 4/4] fix bug in standardise_lr() --- R/standardise_lr.R | 2 +- R/standardize_lr.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/standardise_lr.R b/R/standardise_lr.R index 31c27a5..f7b3495 100644 --- a/R/standardise_lr.R +++ b/R/standardise_lr.R @@ -24,7 +24,7 @@ standardise_lr <- function(lr, method = "mean", ref = NULL) { if(is.null(ref)){ stop("Reference sample is not defined") } - if(!is.null(ref) & !ref %in% names(calls)) { + if(!is.null(ref) & !ref %in% unique(lr$file_name)) { stop("Reference sample doesn't exist in the list of samples") } } diff --git a/R/standardize_lr.R b/R/standardize_lr.R index a4477d7..fa780b3 100644 --- a/R/standardize_lr.R +++ b/R/standardize_lr.R @@ -24,7 +24,7 @@ standardize_lr <- function(lr, method = "mean", ref = NULL) { if(is.null(ref)){ stop("Reference sample is not defined") } - if(!is.null(ref) & !ref %in% names(calls)) { + if(!is.null(ref) & !ref %in% unique(lr$file_name)) { stop("Reference sample doesn't exist in the list of samples") } } -- GitLab