diff --git a/.gitignore b/.gitignore index f7f2ef469c92ae120744e09ed58b10c34579b3a1..103f54c8529341bef7c08d7aa6f56d9d3cfbe88c 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,7 @@ #!*.txt !*.yml !*.yaml +!Snakefile # 3) add a pattern to track the file patterns of section2 even if they are in # subdirectories diff --git a/README.md b/README.md index 2c1b4be917799b85f51f9f94209a04b4be01825c..67ded0ccdb39b1780fd3d29298b7d873da65da04 100644 --- a/README.md +++ b/README.md @@ -3,31 +3,25 @@ With a population generated by msprime (or any VCF), generate structural variant Help for commands is available: ```bash -python main.py -h -python main.py msprime-vcf -h -python main.py sv-vcf -h +python tree_generation.py -h +python variants_generation.py -h ``` +# ! DOC IS NOT UPDATED + ## 1. Use a VCF as a base A VCF is used as a base for genotypes and the position and number of variants. This VCF can be create with `msprime` to generate a population with: ```bash -python main.py msprime-vcf <FAI> <POPULATION SIZE> <MUTATION RATE> <SAMPLE SIZE> +python tree_generation.py -fai <FAI> -p <POPULATION SIZE> -r <MUTATION RATE> -n <SAMPLE SIZE> ``` This command output as many VCF as there are chromosomes in the reference FAI. ## 2. Generate structural variants -Using the script `VISOR_random_bed/random_bed.sh`, generate a BED with structural variant informations. All structural variant types handled by VISOR are in the `VISOR_random_bed/visor_sv_type.yaml` file. Modify input in the script: -- generate as many variants as there are in the VCF -- choose structural variant types -- choose structural variant proportions (must equal 100) - -[How to use the randomregion.r script](https://davidebolo1993.github.io/visordoc/usecases/usecases.html#visor-hack) - -## 3. Get the final VCF -The final VCF is obtained by merging the msprime VCF, the VISOR BED and the structural variant sequences. ```bash -python main.py sv-vcf <BED> <VCF> <FASTA> <OUTNAME> +python variants_generation.py -fai <FAI> -fa <FASTA> -v <VCF> -y <YAML> ``` +The VCF is the one created with the previous command using `msprime`. +YAML template is available in `VISOR_random_bed` folder. Modify the YAML input to set the percentage of each variant you want to simulate (must equal 100). Each variant type has a size distribution file (bins = 100 bp) in folder `sv_distributions`. The data was extracted from [An integrated map of structural variation in 2,504 human genomes (Sudmant, et al. 2015)](https://www.nature.com/articles/nature15394). @@ -39,4 +33,4 @@ The distributions are used to randomly sample each structural variant size. - [ ] tandem repeat contraction - [ ] tandem repeat expansion - [ ] approximate tandem repetition -- [ ] Add VCF merging when there are multiple chromosomes \ No newline at end of file +- [x] Add VCF merging when there are multiple chromosomes \ No newline at end of file diff --git a/VISOR_random_bed/random_bed.sh b/VISOR_random_bed/random_bed.sh deleted file mode 100644 index 67a2a5c1768c87e3bed9b8d19a5862c58555f97b..0000000000000000000000000000000000000000 --- a/VISOR_random_bed/random_bed.sh +++ /dev/null @@ -1,8 +0,0 @@ -### create random BED -refFAI="/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta.fai" -GENOME="/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta" -VISOR_script="randomregion.r" - -# samtools faidx $GENOME -cut -f1,2 $refFAI > chrom.dim.tsv -Rscript $VISOR_script -d chrom.dim.tsv -n 10 -l 2 -v 'SNP,insertion,deletion,inversion' -r '70:10:10:10' -g $GENOME | sortBed > HACk.random.bed \ No newline at end of file diff --git a/VISOR_random_bed/randomregion.r b/VISOR_random_bed/randomregion.r deleted file mode 100644 index 64741c474459247199d2b6289c8be406786099b9..0000000000000000000000000000000000000000 --- a/VISOR_random_bed/randomregion.r +++ /dev/null @@ -1,222 +0,0 @@ -if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager",repos='http://cran.us.r-project.org') -if (!requireNamespace("optparse", quietly = TRUE)) - install.packages("optparse",repos='http://cran.us.r-project.org') -if (!requireNamespace("data.table", quietly = TRUE)) - install.packages("data.table",repos='http://cran.us.r-project.org') -if (!requireNamespace("regioneR", quietly = TRUE)) - BiocManager::install("regioneR") -if (!requireNamespace("Rsamtools", quietly = TRUE)) - BiocManager::install("Rsamtools") - - -suppressPackageStartupMessages(library(optparse)) -suppressPackageStartupMessages(library(data.table)) -suppressPackageStartupMessages(library(regioneR)) -suppressPackageStartupMessages(library(Rsamtools)) - -option_list = list( - make_option(c("-d", "--dimensions"), action="store", type='character', help="TSV file with chromosome dimensions (as result from 'cut -f1,2 genome.fa.fai') [required]"), - make_option(c("-n", "--number"), action="store", default=100, type='numeric', help="number of intervals [100]"), - make_option(c("-l", "--length"), action="store", default=50000, type='numeric', help="mean intervals length [50000]"), - make_option(c("-s", "--standarddev"), action="store", default=0, type='numeric', help="standard deviation for intervals length [0]"), - make_option(c("-x", "--exclude"), action="store", default=NULL, type='character', help="exclude regions in BED [optional]"), - make_option(c("-v", "--variants"), action="store", type='character', help="variants types (-v 'deletion,tandem duplication,inversion') [required]"), - make_option(c("-r", "--ratio"), action="store", type='character', help="variants proportions (-r '30:30:40')) [required]"), - make_option(c("-i", "--idhaplo"), action="store", type='numeric', default=1, help="haplotype number [1]"), - make_option(c("-m", "--microsatellites"), action="store", type='character', default=NULL, help="BED file with microsatellites (ucsc format) [optional]"), - make_option(c("-g", "--genome"), action="store", type="character", default=NULL, help="FASTA genome [optional]") -) - -opt = parse_args(OptionParser(option_list=option_list)) - -possiblevariants<-c('deletion', 'insertion', 'inversion', 'tandem duplication', 'inverted tandem duplication', 'translocation copy-paste', 'translocation cut-paste' , 'reciprocal translocation', 'SNP', 'MNP', 'tandem repeat contraction', 'tandem repeat expansion', 'perfect tandem repetition', 'approximate tandem repetition') -possiblemicro<-c('AAC','AAG','AAT','AC', 'ACA','ACC','ACT','AG','AGA','AGC','AGG','AT','ATA','ATC','ATG','ATT','CA','CAA','CAC','CAG','CAT','CCA','CCG','CCT','CT','CTA','CTC','CTG','CTT','GA','GAA','GAT','GCA','GCC','GGA','GGC','GT','GTG','GTT','TA','TAA','TAC','TAG','TAT','TC','TCA','TCC','TCT','TG','TGA','TGC','TGG','TGT','TTA','TTC','TTG') - -if (is.null(opt$dimensions)) { - stop('-d/--dimensions TSV file is required') -} else { - genome<-fread(file.path(opt$dimensions), sep='\t', header = F) -} - -if (is.null(opt$exclude)) { - exclude<-NULL -} else { - exclude<-fread(file.path(opt$exclude), sep='\t', header=F) -} - -if (is.null(opt$variants)) { - stop('variants in -v/--variants are required') -} - -if (is.null(opt$ratio)) { - stop('variants ratios in -r/--ratio are required') -} - -variants<-unlist(strsplit(opt$variants, ',')) - -if (!all(variants %in% possiblevariants)) { - stop('accepted variants are: ', paste(possiblevariants, collapse=', ')) -} - -testt<-c("tandem repeat contraction", "tandem repeat expansion") - -if (any(testt %in% variants)) { - if (is.null(opt$microsatellites)) { - stop('when adding contractions/expansions of known microsatellites, a BED file specifying known microsatellites must be given') - } else { - microbed<-fread(file.path(opt$microsatellites), sep='\t', header=FALSE) - if (!is.null(exclude)) { - exclude<-data.frame(rbind(exclude[,1:3],data.frame(V1=microbed$V1, V2=microbed$V2-1, V3=microbed$V3+1))) - } else { - exclude<-data.frame(V1=microbed$V1, V2=microbed$V2-2, V3=microbed$V3+2) - } - } -} - -testv<-c("SNP", "MNP") - -if (any(testv %in% variants)) { - if (is.null(opt$genome)) { - stop('when adding SNPs and MNPs, a genome FASTA must be given') - } else { - if (!file.exists(file.path(paste0(opt$genome, '.fai')))) { - indexFa(file.path(opt$genome)) - } - } -} - -number<-as.numeric(unlist(strsplit(opt$ratio, ':'))) - -if (cumsum(number)[length(cumsum(number))] != 100) { - stop('sum of variant ratios must be 100') -} - -if (length(variants) != length(number)) { - stop('for each variant a ratio must be specified') -} - -varwithproportions<-rep(variants, (opt$number/100)*number) - -#the number of regions here is all minus the number of repeat contractions/expansions for which we sample from known regions -keepdist<-varwithproportions[which(varwithproportions %in% testt)] -dft<-data.frame(matrix(NA, nrow = length(keepdist), ncol = 6), stringsAsFactors=FALSE) -colnames(dft)<-c("chromosome","start","end","type","info","breakseqlen") -if (length(keepdist) != 0) { - varwithproportions<-varwithproportions[-which(varwithproportions %in% testt)] #remove - randomicro<-microbed[sample(nrow(microbed), length(keepdist)),] - for (i in 1:nrow(randomicro)) { - dft$chromosome[i]<-randomicro$V1[i] - dft$start[i]<-as.numeric(randomicro$V2[i]) - dft$end[i]<-as.numeric(randomicro$V3[i]) - dft$type[i]<-keepdist[i] - motif<-unlist(strsplit(randomicro$V4[i],'x'))[2] - number_<-as.numeric(unlist(strsplit(randomicro$V4[i],'x'))[1]) - minimum<-1 - if (keepdist[i] == 'tandem repeat contraction') { - maximum<-number_-1 - } else {#is expansion - maximum<-500 - } - dft$info[i]<-paste(motif, sample(minimum:maximum,1),sep=':') - dft$breakseqlen[i]<-0 - } -} - -regions<-createRandomRegions(length(varwithproportions), length.mean=opt$length, length.sd=opt$standarddev, genome=genome, mask=exclude, non.overlapping=TRUE) - -# just in case we need another list of regions to exclude for translocation -newexclude<-data.frame(rbind(exclude[,1:3], cbind(as.character(seqnames(regions)), as.numeric(start(regions)-2), as.numeric(end(regions)+2)))) - -df <- data.frame(chromosome=seqnames(regions),start=start(regions),end=end(regions), type=varwithproportions, stringsAsFactors = FALSE) - -info<-rep('INFO',nrow(df)) -breakseqlen<-rep(0, nrow(df)) - -for(i in (1:nrow(df))) { - if (df$type[i] == 'deletion') { - info[i]<-'None' - breakseqlen[i]<-sample(0:10,1) - } else if (df$type[i] == 'tandem duplication') { - info[i] <- '2' - breakseqlen[i]<-sample(0:10,1) - } else if (df$type[i] == 'inversion'){ - info[i] <- 'None' - breakseqlen[i]<-sample(0:10,1) - } else if (df$type[i] == 'inverted tandem duplication') { - info[i] <- '2' - } else if (df$type[i] == 'insertion') { - df$start[i]<-df$end[i]-1 - num<-round(rnorm(1, mean=opt$length, sd=opt$standarddev)) #adapt mean and stdev for insertions to those specified by the user - alphabet<-c('A', 'T', 'C', 'G') - motif<-paste(sample(alphabet, num, replace = T), collapse='') - info[i]<-motif - breakseqlen[i]<-sample(0:10,1) - } else if (df$type[i] == 'perfect tandem repetition') { - df$start[i]<-df$end[i]-1 - motif<-sample(possiblemicro,1) - num<-sample(6:500,1) - info[i]<-paste(motif,num,sep=':') - breakseqlen[i]<-0 - } else if (df$type[i] == 'approximate tandem repetition') { - df$start[i]<-df$end[i]-1 - motif<-sample(possiblemicro,1) - num<-sample(6:500,1) - err<-sample(1:round(num/3),1) - info[i]<-paste(motif,num,err,sep=':') - breakseqlen[i]<-0 - } else if (df$type[i] == 'SNP') { - df$start[i]<-df$end[i]-1 - tog<-makeGRangesFromDataFrame(data.frame(chromosome=df$chromosome[i], start=df$end[i], end=df$end[i])) - nuc<-as.character(scanFa(opt$genome,tog))[[1]] - if (nuc == 'N') { - alphabet<-c('A', 'T', 'C', 'G', 'N') - } else { - alphabet<-c('A', 'T', 'C', 'G') - } - alphabet<-alphabet[-which(alphabet == nuc)] - info[i]<-sample(alphabet,1) - breakseqlen[i]<-0 - } else if(df$type[i] == 'MNP') { - smallseq<-sample(2:10,1) - df$start[i]<-df$end[i]-smallseq - tog<-makeGRangesFromDataFrame(data.frame(chromosome=df$chromosome[i], start=df$start[i], end=df$end[i])) - nucseq<-as.character(scanFa(opt$genome,tog))[[1]] - for (l in 1:nchar(nucseq)) { - nuc<-substr(nucseq, l, l) - if (nuc == 'N') { - alphabet<-c('A', 'T', 'C', 'G', 'N') - } else { - alphabet<-c('A', 'T', 'C', 'G') - } - alphabet<-alphabet[-which(alphabet == nuc)] - substr(nucseq, l, l)<-sample(alphabet,1) - } - info[i]<-nucseq - breakseqlen[i]<-0 - } else if (df$type[i] == 'translocation cut-paste' | df$type[i] == 'translocation copy-paste') { - newregion<-createRandomRegions(1, length.mean=opt$length, length.sd=opt$standarddev, genome=genome, mask=newexclude, non.overlapping=TRUE) - chromosome<-as.character(seqnames(newregion)) - start<-as.numeric(start(newregion)) - end<-as.numeric(end(newregion)) - orientation<-sample(c('forward', 'reverse'),1) - newexclude<-data.frame(rbind(newexclude,c(chromosome, start-2, end+2))) - info[i]<-paste(paste0('h',opt$idhaplo),chromosome,start,orientation,sep=':') - breakseqlen[i]<-sample(0:10,1) - } else { - newregion<-createRandomRegions(1, length.mean=opt$length, length.sd=opt$standarddev, genome=genome, mask=newexclude, non.overlapping=TRUE) - chromosome<-as.character(seqnames(newregion)) - start<-as.numeric(start(newregion)) - end<-as.numeric(end(newregion)) - orientation1<-sample(c('forward', 'reverse'),1) - orientation2<-sample(c('forward', 'reverse'),1) - newexclude<-data.frame(rbind(newexclude,c(chromosome, start-2, end+2))) - info[i]<-paste(paste0('h',opt$idhaplo+1),chromosome,start,orientation1, orientation2, sep=':') - breakseqlen[i]<-sample(0:10,1) - } -} - -final<-data.frame(df,info,breakseqlen, stringsAsFactors = FALSE) - -fwrite(rbind(final,dft),file = '',quote = FALSE, col.names = FALSE, row.names = FALSE, sep='\t') - diff --git a/VISOR_random_bed/visor_sv_type.yaml b/VISOR_random_bed/visor_sv_type.yaml index 557ca6f883dfa22f023fc9a42dc6858604dbe089..518bcaffc0c7edf4595fe3b6f5ae42f438d0ff7b 100644 --- a/VISOR_random_bed/visor_sv_type.yaml +++ b/VISOR_random_bed/visor_sv_type.yaml @@ -1,15 +1,10 @@ -# type de SV possible dans le script randomregion.r de VISOR +### percentage of each variant type in the simulation, sum of values should be 100 +SNP: 0 deletion: 0 insertion: 0 -inversion: 0 +inversion: 100 tandem duplication: 0 inverted tandem duplication: 0 translocation copy-paste: 0 translocation cut-paste : 0 -reciprocal translocation: 0 -SNP: 0 -# MNP: 0 -# tandem repeat contraction: 0 -# tandem repeat expansion: 0 -# perfect tandem repetition: 0 -# approximate tandem repetition: 0 \ No newline at end of file +reciprocal translocation: 0 \ No newline at end of file diff --git a/bed2vcf.py b/bed2vcf.py index 1506eff57bcd98ba8137e4e2755236bb4b46db91..3d3172c80da8c81598f4e6d1373e4002498ea3e1 100644 --- a/bed2vcf.py +++ b/bed2vcf.py @@ -1,7 +1,6 @@ import pandas as pd import re, random, json, os import numpy as np - from Bio import SeqIO from Bio.Seq import Seq @@ -15,6 +14,11 @@ def read_fa(input_file): def DNA(length): return ''.join(random.choice('CGTA') for _ in range(length)) +def SNP(aa): + s = 'ACGT' + s = s.replace(aa, '') + return(random.choice(s)) + # check if reverse def is_reverse(s): return(s=="reverse") @@ -30,11 +34,11 @@ def deletion(ref_seq, alt_seq): alt = str(ref_seq) return(ref,alt) -def set_ref_alt(ref, alt, row, vcf_df): - vcf_df.at[row, "REF"] = ref - vcf_df.at[row, "ALT"] = alt +def set_ref_alt(ref, alt, row, df): + df.at[row, "REF"] = ref + df.at[row, "ALT"] = alt -def get_random_len(svtype): +def get_random_len(svtype): # INS, DEL, DUP, CNV, INV df = pd.read_csv("sv_distributions/size_distrib" + svtype + ".tsv", sep="\t") pb = df["pb"].tolist() @@ -45,31 +49,38 @@ def get_random_len(svtype): s = np.random.uniform(interval[0], interval[1], 1).round() return(int(s[0])) -# get the sequence of each variants in BED -# output file is a VCF -# (currently, not all variant supported by VISOR are included) -# TODO : missing VISOR variant type : SNP, MNP and 4 types of tandem repeat - +### get the sequence of each variants in BED, output VCF # vcf_df : msprime VCF # bed_def : VISOR BED # fa_dict : FASTA where variants will be generated def get_seq(vcf_df, bed_df, fa_dict, output_file): - print(bed_df) -# number of variants - n = len(vcf_df) + cols = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + ncol = len(vcf_df.columns) - len(cols) + li = [i for i in range(ncol)] + cols.extend(li) + vcf_df.columns = cols + + df = vcf_df.join(bed_df) + # number of variants + n = len(df) + # add variants + series = [] for i in range(n): - sv_info = bed_df.iloc[i] - chr_name = sv_info[0] - start = sv_info[1] - start = start-1 # pour ajuster à l'index python - sv_type = sv_info[3] + ## nom du chr pris dans le VCF + chr_name = df["CHROM"].iloc[i] + ## position du variant pris dans le VCF + start = df["POS"].iloc[i] + start = int(start)-1 # pour ajuster à l'index python + ## type de SV à générer + sv_info = df['SVINFO'].iloc[i] + sv_type = df['SVTYPE'].iloc[i] fasta_seq = fa_dict[chr_name].upper() if sv_type == "SNP": - ref = str(fasta_seq.seq[start:end]) - alt = sv_info[4] + ref = str(fasta_seq.seq[start]) + alt = SNP(ref) elif sv_type == "deletion": end = start + get_random_len("DEL") @@ -91,10 +102,10 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): end = start + get_random_len("DUP") ref = str(fasta_seq.seq[start]) alt_seq1 = str(fasta_seq.seq[start+1:end]) - cp = int(sv_info[4])-1 + cp = sv_info-1 # multiplication pour obtenir les copies alt_seq = alt_seq1*cp - if sv_type =="inverted tandem duplication": + if sv_type == "inverted tandem duplication": alt_seq = reverse(alt_seq) alt = ref + alt_seq @@ -105,25 +116,22 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): end = start + l # informations sur la translocation - trans_info = sv_info[4] - print(trans_info, type(trans_info)) - infos = trans_info.split(":") - trans_start = int(infos[2])-1 + trans_chr = sv_info[0] + trans_start = sv_info[1]-1 trans_end = trans_start + l - trans_chr = infos[1] - + fasta_seq_trans = fa_dict[trans_chr].upper() + # translocation réciproque if sv_type == "reciprocal translocation": ref = str(fasta_seq.seq[start:end]) - fasta_seq_trans = fa_dict[trans_chr].upper() ref2 = str(fasta_seq_trans.seq[trans_start:trans_end]) - if infos[3] == "reverse": + if sv_info[2] == "reverse": alt = str(fasta_seq.seq[start]) + reverse(str(fasta_seq_trans.seq[trans_start+1:trans_end])) else : alt = str(fasta_seq.seq[start]) + str(fasta_seq_trans.seq[trans_start+1:trans_end]) - if infos[4] == "reverse": + if sv_info[3] == "reverse": alt2 = str(fasta_seq_trans.seq[trans_start]) + reverse(str(fasta_seq.seq[start+1:end])) else: alt2 = str(fasta_seq_trans.seq[trans_start]) + str(fasta_seq.seq[start+1:end]) @@ -136,7 +144,7 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): # insertion ref2 = str(fasta_seq_trans.seq[trans_start]) - if infos[3] == "reverse": + if sv_info[2] == "reverse": alt2 = ref2 + reverse(str(fasta_seq.seq[start+1:end])) else : alt2 = ref2 + str(fasta_seq.seq[start+1:end]) @@ -144,7 +152,7 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): # copier-coller else: ref = str(fasta_seq.seq[start]) - alt = vcf_df["ALT"].iloc[i] + alt = SNP(ref) ref2 = str(fasta_seq_trans.seq[trans_start]) alt2 = ref2 + str(fasta_seq.seq[start+1:end]) @@ -154,14 +162,19 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): new_vcf_var[1] = trans_start+1 new_vcf_var[3] = ref2 new_vcf_var[4] = alt2 - vcf_df.loc[len(vcf_df)] = new_vcf_var + save_series = pd.Series(new_vcf_var, index=cols) + series.append(save_series) + + set_ref_alt(ref, alt, i, df) - set_ref_alt(ref, alt, i, vcf_df) + df_add = pd.DataFrame(series) + df = pd.concat([df, df_add], ignore_index=True) + df = df.drop(['SVTYPE', 'SVINFO'], axis=1) # remove unnecessary columns for VCF - vcf_df["ID"] = "." - vcf_df["QUAL"] = 99 + df["ID"] = "." + df["QUAL"] = 99 # output VCF if not os.path.exists("results"): os.mkdir("results") - vcf_df.to_csv("results/" + output_file + ".vcf", sep="\t", header=False, index=False) \ No newline at end of file + df.to_csv("results/" + output_file + ".vcf", sep="\t", header=False, index=False) \ No newline at end of file diff --git a/fusion.py b/fusion.py deleted file mode 100644 index ea0696c006791743574da20139f30d3ae4f0d7d4..0000000000000000000000000000000000000000 --- a/fusion.py +++ /dev/null @@ -1,42 +0,0 @@ -# fusionner le VCF de msprime (position, génotype) et le BED de VISOR -# pour obtenir un BED avec tous les variants -# récupérer les variants et les génotypes msprime -import io, os, shutil -import pandas as pd - -def read_vcf(input_file): - with open(input_file, "r") as txt: - t = txt.read() - txt.close() - - # write header for later - if not os.path.exists("results"): - os.mkdir("results") - f = open("results/vcf_header.txt", "w") - f.write(''.join(t.splitlines(keepends=True)[:6])) - f.close() - - # remove VCF header for dataframe - t = ''.join(t.splitlines(keepends=True)[5:]) - df = pd.read_table(io.StringIO(t)) - df = df.rename(columns={"#CHROM" : "CHROM"}) - return(df) - -def read_BED(input_file): - colnames=["chr", "start", "end", "type","info","breakpoint"] - df = pd.read_table(input_file, header=None, names=colnames) - return(df) - -def replace_bed_col(input_BED, input_VCF): - vcf = read_vcf(input_VCF) - bed = read_BED(input_BED) - if len(bed) == len(vcf): - bed["chr"] = vcf["CHROM"] - bed["start"] = vcf["POS"] - return(bed) - -def merge_final_vcf(header_file, content_file, merged_file): - with open(merged_file, 'wb') as outfile: - for filename in [header_file, content_file]: - with open(filename, 'rb') as infile: - shutil.copyfileobj(infile, outfile) \ No newline at end of file diff --git a/main.py b/main.py deleted file mode 100644 index da8b303e1873f857b436809ca1336aff42301de5..0000000000000000000000000000000000000000 --- a/main.py +++ /dev/null @@ -1,45 +0,0 @@ -from bed2vcf import read_fa, get_seq -from fusion import * -from tree_generation import msprime_vcf -import defopt - -## n : length of variant -def sv_vcf(bed: str, vcf: str, fasta: str, outName: str): - """ - Create a VCF with structural variants. - - :parameter bed: BED file of structural variants to include - :parameter vcf: VCF generated with msprime --> tree generation script - :parameter fasta: Reference FASTA for the VCF and the variants - :parameter outName: Output name - - """ - bed_df = replace_bed_col(bed, vcf) - vcf_df=read_vcf(vcf) - fa = read_fa(fasta) - - get_seq(vcf_df, bed_df, fa, outName) - - header_file="results/vcf_header.txt" - content_file = "results/" + outName + ".vcf" - merged_file="results/" + outName + "_final.vcf" - merge_final_vcf(header_file, content_file, merged_file) - -if __name__ == '__main__': - defopt.run([sv_vcf, msprime_vcf]) - -bed = "test/mini_random.bed" -vcf = "test/output.vcf" -fasta = "/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta" - -output_file="tritici_test" -# main(bed, vcf, fasta, output_file) - -### generate msprime population VCF -# fai = "ztIPO323.chr8.fasta.fai" - -# ## samtools FAI -# ## population size -# ## mutation rate -# ## sample size -# msprime_vcf(fai, 5000, 1e-8, 60) \ No newline at end of file diff --git a/randombed.py b/randombed.py new file mode 100644 index 0000000000000000000000000000000000000000..3f15a719a596e4a1eb6c5bfc722b159e9bf29833 --- /dev/null +++ b/randombed.py @@ -0,0 +1,72 @@ +import pandas as pd +import yaml, random, re + +## read yaml file containing variants informations +def read_yaml(f): + stream = open(f, 'r') + d = yaml.safe_load(stream) + stream.close() + if sum(d.values()) != 100: + raise ValueError("Sum of values must be 100") + return(d) + +infos_dict = {'deletion': None, 'insertion': None, 'inversion': None, 'SNP': None, + 'tandem duplication': 2, 'inverted tandem duplication': 2, + 'translocation copy-paste': 0, 'translocation cut-paste': 0, 'reciprocal translocation': 0} + +## generate info field for translocations +# randomly select chromosome to add sequence to +def select_chr(fai): + df = pd.read_table(fai, header=None, usecols=[0,1], names =["name", "length"]) + names=df["name"].to_list() + chr = random.choice(names) + df = pd.pivot_table(df, columns='name') + len = df.iloc[0][chr] + pos = random.randrange(len) + li = [chr, pos] + return(li) + +# randomly select sequence orientation +def select_orientation(li): + li.append(random.choice(["forward", "reverse"])) + return(li) + +## create dataframe with all the variants to simulate +def generate_type(nvar, yml, fai): + d = read_yaml(yml) + ## list for variant type and info + vartype = [] + infos = [] + ## filter dict to keep value != 0 + dict = {k: v for k, v in d.items() if v != 0} + for k in dict: + # generate n variants + perc = dict[k] + n = round(perc*nvar/100) + # add info field + if re.search("translocation", k): + for i in range(n): + info = select_chr(fai) + info = select_orientation(info) + if k == "reciprocal translocation": + info = select_orientation(info) + infos.extend([info]) + i += 1 + else: + info = infos_dict[k] + infos.extend([info]*n) + vartype.extend([k]*n) + + if len(vartype) != nvar: + n = nvar - len(vartype) + vartype.extend(["SNP"]*n) + infos.extend([None]*n) + + for_df = {'SVTYPE': vartype, 'SVINFO': infos} + + df = pd.DataFrame(data = for_df) + # shuffle values in dataframe + df = df.sample(frac = 1, ignore_index=True) + # print(df) + df.to_csv("random_var.tsv", sep="\t", index=False) + return(df) diff --git a/tree_generation.py b/tree_generation.py index 68b67f3e910c0aedde3f57d1ae629eb33022f610..223b71d494e4463a9aa380114da056c913144288 100644 --- a/tree_generation.py +++ b/tree_generation.py @@ -1,6 +1,7 @@ import pandas as pd -import math, msprime +import math, msprime, argparse, os import numpy as np +from multiprocessing import Process def chrom_pos(df): l=df["length"].to_list() @@ -28,19 +29,42 @@ def msprime_map(df): rate_map=msprime.RateMap(position=map_positions, rate=rates) return(rate_map) +### ts_chroms = list of trees for each chromosome +### names = list of chromosome name +def output_files(ts_chroms, names, outdir="results"): + if not os.path.exists(outdir): + os.mkdir(outdir) + + ## write vcf + filename = outdir + "/msprime_simulation.vcf" + for i in range(len(ts_chroms)): + txt = ts_chroms[i].as_vcf(contig_id=names[i]) + t = ''.join(txt.splitlines(keepends=True)[6:]) + # filename = 'tmp_vcf/output' + str(i) + ".vcf" + if i==0: + header_top = ''.join(txt.splitlines(keepends=True)[:4]) + header_bot = ''.join(txt.splitlines(keepends=True)[4:6]) + with open(filename, "w") as vcf_file: + vcf_file.write(t) + # ts_chroms[i].write_vcf(vcf_file, contig_id=names[i]) + else: + header_top += txt.splitlines(keepends=True)[3] + + with open(filename, "a") as vcf_file: + vcf_file.write(t) + vcf_file.close() + ## write header to file + header = header_top + header_bot + with open(outdir + "/header.vcf", "w") as head: + head.write(header) + head.close() + # input_fai = samtools FAI of FASTA where variants will be generated # pop_size = population size # mut_rate = mutation rate # n = sample size / number of indiv -def msprime_vcf(fai: str, pop_size: int, mut_rate: float, n: int): - """ - Generate VCF for each chromosome in reference FASTA. - - :parameter fai: FAI samtools index of reference FASTA - :parameter pop_size: population size (Ne) - :parameter mut_rate: mutation rate (µ) - :parameter n: sample size - """ +def msprime_vcf(fai, pop_size, mut_rate, n, outdir): + df = pd.read_table(fai, header=None, usecols=[0,1], names =["name", "length"]) rate_map=msprime_map(df) @@ -55,7 +79,7 @@ def msprime_vcf(fai: str, pop_size: int, mut_rate: float, n: int): mutated_ts = msprime.sim_mutations( ts, rate=mut_rate, # random_seed=5678, - discrete_genome=False) + discrete_genome=True) ts_chroms = [] @@ -69,20 +93,26 @@ def msprime_vcf(fai: str, pop_size: int, mut_rate: float, n: int): start, end = chrom_positions[j: j + 2] chrom_ts = mutated_ts.keep_intervals([[start, end]], simplify=False).trim() ts_chroms.append(chrom_ts) - print(chrom_ts.sequence_length) names=df["name"].to_list() - for i in range(len(ts_chroms)): - filename="msprime_output_chr.vcf" - if i==0: - with open(filename, "w") as vcf_file: - ts_chroms[i].write_vcf(vcf_file, contig_id=names[i]) - else: - txt = ts_chroms[i].as_vcf(contig_id=names[i]) - t = ''.join(txt.splitlines(keepends=True)[6:]) - with open(filename, "a") as vcf_file: - vcf_file.write(t) - vcf_file.close() + ## create files + if outdir == None: + output_files(ts_chroms, names) + else: + output_files(ts_chroms, names, outdir) + +parser = argparse.ArgumentParser(description='Generate VCF for each chromosome in reference FASTA.') +parser.add_argument('-fai', '--fai', type=str, required = True, help='FAI samtools index of reference FASTA') +parser.add_argument('-p', '--popSize', type=int, required = True, help='population size (Ne)') +parser.add_argument('-r', '--rate', type=float, required = True, help='mutation rate (µ)') +parser.add_argument('-n', '--sampleSize', type=int, required = True, help='sample size') +parser.add_argument('-o', '--outDir', type=str, help='output directory') - print(len(ts_chroms)) \ No newline at end of file +if __name__ == '__main__': + args = parser.parse_args() + d = vars(args) + li = list(d.values()) + p = Process(target=msprime_vcf, args=(li)) + p.start() + p.join() \ No newline at end of file diff --git a/variants_generation.py b/variants_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..df622cbf6a3a96a2a46643840382cfbe286c4cea --- /dev/null +++ b/variants_generation.py @@ -0,0 +1,32 @@ +from bed2vcf import read_fa, get_seq +from randombed import generate_type +import argparse +import pandas as pd +from multiprocessing import Process + +## n : length of variant +def sv_vcf(vcf, fasta, fai, yml, outName): + vcf_df = pd.read_table(vcf, sep="\t") + nvar = len(vcf_df) + bed_df = generate_type(nvar, yml, fai) + fa = read_fa(fasta) + + get_seq(vcf_df, bed_df, fa, outName) + +# parse arguments +parser = argparse.ArgumentParser(description='create final VCF') +# parser.add_argument('-b','--bed', type=str, required = True, help='BED file of structural variants to include') +parser.add_argument('-v', '--vcf', type=str, required = True, help='VCF generated with msprime --> tree generation script') +parser.add_argument('-fa', '--fasta', type=str, required = True, help='Reference FASTA for the VCF and the variants') +parser.add_argument('-fai', '--fai', type=str, required = True, help='Samtools index of reference FASTA') +parser.add_argument('-y', '--yaml', type=str, required = True, help='Reference FASTA for the VCF and the variants') +parser.add_argument('-o', '--outName', type=str, required = True, help='Output name') +# parser.add_argument('--header', type=int, help='vcf header size') + +if __name__ == '__main__': + args = parser.parse_args() + d = vars(args) + li = list(d.values()) + p = Process(target=sv_vcf, args=(li)) + p.start() + p.join() \ No newline at end of file