diff --git a/.gitignore b/.gitignore
index f7f2ef469c92ae120744e09ed58b10c34579b3a1..103f54c8529341bef7c08d7aa6f56d9d3cfbe88c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,6 +48,7 @@
 #!*.txt
 !*.yml
 !*.yaml
+!Snakefile
 
 # 3) add a pattern to track the file patterns of section2 even if they are in
     # subdirectories
diff --git a/README.md b/README.md
index 2c1b4be917799b85f51f9f94209a04b4be01825c..67ded0ccdb39b1780fd3d29298b7d873da65da04 100644
--- a/README.md
+++ b/README.md
@@ -3,31 +3,25 @@ With a population generated by msprime (or any VCF), generate structural variant
 
 Help for commands is available:
 ```bash
-python main.py -h
-python main.py msprime-vcf -h
-python main.py sv-vcf -h
+python tree_generation.py -h
+python variants_generation.py -h
 ```
 
+# ! DOC IS NOT UPDATED
+
 ## 1. Use a VCF as a base
 A VCF is used as a base for genotypes and the position and number of variants. This VCF can be create with `msprime` to generate a population with:
 ```bash
-python main.py msprime-vcf <FAI> <POPULATION SIZE> <MUTATION RATE> <SAMPLE SIZE> 
+python tree_generation.py -fai <FAI> -p <POPULATION SIZE> -r <MUTATION RATE> -n <SAMPLE SIZE>
 ```
 This command output as many VCF as there are chromosomes in the reference FAI.
 
 ## 2. Generate structural variants
-Using the script `VISOR_random_bed/random_bed.sh`, generate a BED with structural variant informations. All structural variant types handled by VISOR are in the `VISOR_random_bed/visor_sv_type.yaml` file. Modify input in the script:
-- generate as many variants as there are in the VCF
-- choose structural variant types
-- choose structural variant proportions (must equal 100)
-
-[How to use the randomregion.r script](https://davidebolo1993.github.io/visordoc/usecases/usecases.html#visor-hack)
-
-## 3. Get the final VCF
-The final VCF is obtained by merging the msprime VCF, the VISOR BED and the structural variant sequences.
 ```bash
-python main.py sv-vcf <BED> <VCF> <FASTA> <OUTNAME>
+python variants_generation.py -fai <FAI> -fa <FASTA> -v <VCF> -y <YAML>
 ```
+The VCF is the one created with the previous command using `msprime`.
+YAML template is available in `VISOR_random_bed` folder. Modify the YAML input to set the percentage of each variant you want to simulate (must equal 100).
 
 Each variant type has a size distribution file (bins = 100 bp) in folder `sv_distributions`. The data was extracted from [An integrated map of structural variation in 2,504 human genomes (Sudmant, et al. 2015)](https://www.nature.com/articles/nature15394).
 
@@ -39,4 +33,4 @@ The distributions are used to randomly sample each structural variant size.
     - [ ] tandem repeat contraction
     - [ ] tandem repeat expansion
     - [ ] approximate tandem repetition
-- [ ] Add VCF merging when there are multiple chromosomes
\ No newline at end of file
+- [x] Add VCF merging when there are multiple chromosomes
\ No newline at end of file
diff --git a/VISOR_random_bed/random_bed.sh b/VISOR_random_bed/random_bed.sh
deleted file mode 100644
index 67a2a5c1768c87e3bed9b8d19a5862c58555f97b..0000000000000000000000000000000000000000
--- a/VISOR_random_bed/random_bed.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-### create random BED
-refFAI="/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta.fai"
-GENOME="/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta"
-VISOR_script="randomregion.r"
-
-# samtools faidx $GENOME
-cut -f1,2 $refFAI > chrom.dim.tsv
-Rscript $VISOR_script -d chrom.dim.tsv -n 10 -l 2 -v 'SNP,insertion,deletion,inversion' -r '70:10:10:10' -g $GENOME | sortBed > HACk.random.bed
\ No newline at end of file
diff --git a/VISOR_random_bed/randomregion.r b/VISOR_random_bed/randomregion.r
deleted file mode 100644
index 64741c474459247199d2b6289c8be406786099b9..0000000000000000000000000000000000000000
--- a/VISOR_random_bed/randomregion.r
+++ /dev/null
@@ -1,222 +0,0 @@
-if (!requireNamespace("BiocManager", quietly = TRUE))
-  install.packages("BiocManager",repos='http://cran.us.r-project.org')
-if (!requireNamespace("optparse", quietly = TRUE))
-  install.packages("optparse",repos='http://cran.us.r-project.org')
-if (!requireNamespace("data.table", quietly = TRUE))
-  install.packages("data.table",repos='http://cran.us.r-project.org') 
-if (!requireNamespace("regioneR", quietly = TRUE))
-  BiocManager::install("regioneR")
-if (!requireNamespace("Rsamtools", quietly = TRUE))
-  BiocManager::install("Rsamtools")
-
-
-suppressPackageStartupMessages(library(optparse))
-suppressPackageStartupMessages(library(data.table))
-suppressPackageStartupMessages(library(regioneR))
-suppressPackageStartupMessages(library(Rsamtools))
-
-option_list = list(
-  make_option(c("-d", "--dimensions"), action="store", type='character', help="TSV file with chromosome dimensions (as result from 'cut -f1,2 genome.fa.fai') [required]"),
-  make_option(c("-n", "--number"), action="store", default=100, type='numeric', help="number of intervals [100]"),
-  make_option(c("-l", "--length"), action="store", default=50000, type='numeric', help="mean intervals length [50000]"),
-  make_option(c("-s", "--standarddev"), action="store", default=0, type='numeric', help="standard deviation for intervals length [0]"),
-  make_option(c("-x", "--exclude"), action="store", default=NULL, type='character', help="exclude regions in BED [optional]"),
-  make_option(c("-v", "--variants"), action="store", type='character', help="variants types (-v 'deletion,tandem duplication,inversion') [required]"),
-  make_option(c("-r", "--ratio"), action="store", type='character', help="variants proportions (-r '30:30:40')) [required]"),
-  make_option(c("-i", "--idhaplo"), action="store", type='numeric', default=1, help="haplotype number [1]"),
-  make_option(c("-m", "--microsatellites"), action="store", type='character', default=NULL, help="BED file with microsatellites (ucsc format) [optional]"),
-  make_option(c("-g", "--genome"), action="store", type="character", default=NULL, help="FASTA genome [optional]")
-)
-
-opt = parse_args(OptionParser(option_list=option_list))
-
-possiblevariants<-c('deletion', 'insertion', 'inversion', 'tandem duplication', 'inverted tandem duplication', 'translocation copy-paste', 'translocation cut-paste' ,  'reciprocal translocation', 'SNP', 'MNP', 'tandem repeat contraction', 'tandem repeat expansion', 'perfect tandem repetition', 'approximate tandem repetition')
-possiblemicro<-c('AAC','AAG','AAT','AC', 'ACA','ACC','ACT','AG','AGA','AGC','AGG','AT','ATA','ATC','ATG','ATT','CA','CAA','CAC','CAG','CAT','CCA','CCG','CCT','CT','CTA','CTC','CTG','CTT','GA','GAA','GAT','GCA','GCC','GGA','GGC','GT','GTG','GTT','TA','TAA','TAC','TAG','TAT','TC','TCA','TCC','TCT','TG','TGA','TGC','TGG','TGT','TTA','TTC','TTG')
-
-if (is.null(opt$dimensions)) {
-  stop('-d/--dimensions TSV file is required')
-} else {
-  genome<-fread(file.path(opt$dimensions), sep='\t', header = F)
-}
-
-if (is.null(opt$exclude)) {
-  exclude<-NULL
-} else {
-  exclude<-fread(file.path(opt$exclude), sep='\t', header=F)
-}
-
-if (is.null(opt$variants)) {
-  stop('variants in -v/--variants are required')
-}
-
-if (is.null(opt$ratio)) {
-  stop('variants ratios in -r/--ratio are required')
-}
-
-variants<-unlist(strsplit(opt$variants, ','))
-
-if (!all(variants %in% possiblevariants)) {
-  stop('accepted variants are: ', paste(possiblevariants, collapse=', '))
-}
-
-testt<-c("tandem repeat contraction", "tandem repeat expansion")
-
-if (any(testt %in% variants)) {
-  if (is.null(opt$microsatellites)) {
-    stop('when adding contractions/expansions of known microsatellites, a BED file specifying known microsatellites must be given')
-  } else {
-    microbed<-fread(file.path(opt$microsatellites), sep='\t', header=FALSE)
-    if (!is.null(exclude)) {
-      exclude<-data.frame(rbind(exclude[,1:3],data.frame(V1=microbed$V1, V2=microbed$V2-1, V3=microbed$V3+1)))
-    } else {
-      exclude<-data.frame(V1=microbed$V1, V2=microbed$V2-2, V3=microbed$V3+2)
-    }
-  }
-}
-
-testv<-c("SNP", "MNP")
-
-if (any(testv %in% variants)) {
-  if (is.null(opt$genome)) {
-    stop('when adding SNPs and MNPs, a genome FASTA must be given')
-  } else {
-    if (!file.exists(file.path(paste0(opt$genome, '.fai')))) {
-      indexFa(file.path(opt$genome))
-    }
-  }
-}
-
-number<-as.numeric(unlist(strsplit(opt$ratio, ':')))
-
-if (cumsum(number)[length(cumsum(number))] != 100) {
-  stop('sum of variant ratios must be 100')
-}
-
-if (length(variants) != length(number)) {
-  stop('for each variant a ratio must be specified')
-}
-
-varwithproportions<-rep(variants, (opt$number/100)*number)
-
-#the number of regions here is all minus the number of repeat contractions/expansions for which we sample from known regions
-keepdist<-varwithproportions[which(varwithproportions %in% testt)]
-dft<-data.frame(matrix(NA, nrow = length(keepdist), ncol = 6), stringsAsFactors=FALSE)
-colnames(dft)<-c("chromosome","start","end","type","info","breakseqlen")
-if (length(keepdist) != 0) {
-  varwithproportions<-varwithproportions[-which(varwithproportions %in% testt)] #remove
-  randomicro<-microbed[sample(nrow(microbed), length(keepdist)),]
-  for (i in 1:nrow(randomicro)) {
-    dft$chromosome[i]<-randomicro$V1[i]
-    dft$start[i]<-as.numeric(randomicro$V2[i])
-    dft$end[i]<-as.numeric(randomicro$V3[i])
-    dft$type[i]<-keepdist[i]
-    motif<-unlist(strsplit(randomicro$V4[i],'x'))[2]
-    number_<-as.numeric(unlist(strsplit(randomicro$V4[i],'x'))[1])
-    minimum<-1
-    if (keepdist[i] == 'tandem repeat contraction') {
-      maximum<-number_-1
-    } else {#is expansion
-      maximum<-500
-    }
-    dft$info[i]<-paste(motif, sample(minimum:maximum,1),sep=':')
-    dft$breakseqlen[i]<-0
-  }
-}
-
-regions<-createRandomRegions(length(varwithproportions), length.mean=opt$length, length.sd=opt$standarddev, genome=genome, mask=exclude, non.overlapping=TRUE)
-
-# just in case we need another list of regions to exclude for translocation
-newexclude<-data.frame(rbind(exclude[,1:3], cbind(as.character(seqnames(regions)), as.numeric(start(regions)-2), as.numeric(end(regions)+2))))
-
-df <- data.frame(chromosome=seqnames(regions),start=start(regions),end=end(regions), type=varwithproportions, stringsAsFactors = FALSE)
-
-info<-rep('INFO',nrow(df))
-breakseqlen<-rep(0, nrow(df))
-
-for(i in (1:nrow(df))) {
-  if (df$type[i] == 'deletion') {
-    info[i]<-'None'
-    breakseqlen[i]<-sample(0:10,1)
-  } else if (df$type[i] == 'tandem duplication') {
-    info[i] <- '2'
-    breakseqlen[i]<-sample(0:10,1)
-  } else if (df$type[i] == 'inversion'){
-    info[i] <- 'None'
-    breakseqlen[i]<-sample(0:10,1)
-  } else if (df$type[i] == 'inverted tandem duplication') {
-    info[i] <- '2'
-  } else if (df$type[i] == 'insertion') {
-    df$start[i]<-df$end[i]-1
-    num<-round(rnorm(1, mean=opt$length, sd=opt$standarddev)) #adapt mean and stdev for insertions to those specified by the user
-    alphabet<-c('A', 'T', 'C', 'G')
-    motif<-paste(sample(alphabet, num, replace = T), collapse='')
-    info[i]<-motif
-    breakseqlen[i]<-sample(0:10,1)    
-  } else if (df$type[i] == 'perfect tandem repetition') {
-    df$start[i]<-df$end[i]-1
-    motif<-sample(possiblemicro,1)
-    num<-sample(6:500,1)
-    info[i]<-paste(motif,num,sep=':')
-    breakseqlen[i]<-0
-  } else if (df$type[i] == 'approximate tandem repetition') {
-    df$start[i]<-df$end[i]-1
-    motif<-sample(possiblemicro,1)
-    num<-sample(6:500,1)
-    err<-sample(1:round(num/3),1)
-    info[i]<-paste(motif,num,err,sep=':')
-    breakseqlen[i]<-0
-  } else if (df$type[i] == 'SNP') {
-    df$start[i]<-df$end[i]-1
-    tog<-makeGRangesFromDataFrame(data.frame(chromosome=df$chromosome[i], start=df$end[i], end=df$end[i]))
-    nuc<-as.character(scanFa(opt$genome,tog))[[1]]
-    if (nuc == 'N') {
-      alphabet<-c('A', 'T', 'C', 'G', 'N')
-    } else {
-      alphabet<-c('A', 'T', 'C', 'G')
-    }
-    alphabet<-alphabet[-which(alphabet == nuc)]
-    info[i]<-sample(alphabet,1)
-    breakseqlen[i]<-0
-  } else if(df$type[i] == 'MNP') {
-    smallseq<-sample(2:10,1)
-    df$start[i]<-df$end[i]-smallseq
-    tog<-makeGRangesFromDataFrame(data.frame(chromosome=df$chromosome[i], start=df$start[i], end=df$end[i]))
-    nucseq<-as.character(scanFa(opt$genome,tog))[[1]]
-    for (l in 1:nchar(nucseq)) {
-      nuc<-substr(nucseq, l, l)
-      if (nuc == 'N') {
-        alphabet<-c('A', 'T', 'C', 'G', 'N')
-      } else {
-        alphabet<-c('A', 'T', 'C', 'G')
-      }
-      alphabet<-alphabet[-which(alphabet == nuc)]
-      substr(nucseq, l, l)<-sample(alphabet,1)
-    }
-    info[i]<-nucseq
-    breakseqlen[i]<-0
-  } else if (df$type[i] == 'translocation cut-paste' | df$type[i] == 'translocation copy-paste') {
-    newregion<-createRandomRegions(1, length.mean=opt$length, length.sd=opt$standarddev, genome=genome, mask=newexclude, non.overlapping=TRUE)
-    chromosome<-as.character(seqnames(newregion))
-    start<-as.numeric(start(newregion))
-    end<-as.numeric(end(newregion))
-    orientation<-sample(c('forward', 'reverse'),1)
-    newexclude<-data.frame(rbind(newexclude,c(chromosome, start-2, end+2)))
-    info[i]<-paste(paste0('h',opt$idhaplo),chromosome,start,orientation,sep=':')
-    breakseqlen[i]<-sample(0:10,1) 
-  } else {
-    newregion<-createRandomRegions(1, length.mean=opt$length, length.sd=opt$standarddev, genome=genome, mask=newexclude, non.overlapping=TRUE)
-    chromosome<-as.character(seqnames(newregion))
-    start<-as.numeric(start(newregion))
-    end<-as.numeric(end(newregion))
-    orientation1<-sample(c('forward', 'reverse'),1)
-    orientation2<-sample(c('forward', 'reverse'),1)
-    newexclude<-data.frame(rbind(newexclude,c(chromosome, start-2, end+2)))
-    info[i]<-paste(paste0('h',opt$idhaplo+1),chromosome,start,orientation1, orientation2, sep=':')
-    breakseqlen[i]<-sample(0:10,1) 
-  }
-} 
-
-final<-data.frame(df,info,breakseqlen, stringsAsFactors = FALSE)
-
-fwrite(rbind(final,dft),file = '',quote = FALSE, col.names = FALSE, row.names = FALSE, sep='\t')
-
diff --git a/VISOR_random_bed/visor_sv_type.yaml b/VISOR_random_bed/visor_sv_type.yaml
index 557ca6f883dfa22f023fc9a42dc6858604dbe089..518bcaffc0c7edf4595fe3b6f5ae42f438d0ff7b 100644
--- a/VISOR_random_bed/visor_sv_type.yaml
+++ b/VISOR_random_bed/visor_sv_type.yaml
@@ -1,15 +1,10 @@
-# type de SV possible dans le script randomregion.r de VISOR
+### percentage of each variant type in the simulation, sum of values should be 100
+SNP: 0
 deletion: 0
 insertion: 0
-inversion: 0
+inversion: 100
 tandem duplication: 0
 inverted tandem duplication: 0
 translocation copy-paste: 0
 translocation cut-paste : 0
-reciprocal translocation: 0
-SNP: 0
-# MNP: 0
-# tandem repeat contraction: 0
-# tandem repeat expansion: 0
-# perfect tandem repetition: 0
-# approximate tandem repetition: 0
\ No newline at end of file
+reciprocal translocation: 0
\ No newline at end of file
diff --git a/bed2vcf.py b/bed2vcf.py
index 1506eff57bcd98ba8137e4e2755236bb4b46db91..3d3172c80da8c81598f4e6d1373e4002498ea3e1 100644
--- a/bed2vcf.py
+++ b/bed2vcf.py
@@ -1,7 +1,6 @@
 import pandas as pd
 import re, random, json, os
 import numpy as np
-
 from Bio import SeqIO
 from Bio.Seq import Seq
 
@@ -15,6 +14,11 @@ def read_fa(input_file):
 def DNA(length):
     return ''.join(random.choice('CGTA') for _ in range(length))
 
+def SNP(aa):
+	s = 'ACGT'
+	s = s.replace(aa, '')
+	return(random.choice(s))
+
 # check if reverse
 def is_reverse(s):
 	return(s=="reverse")
@@ -30,11 +34,11 @@ def deletion(ref_seq, alt_seq):
 	alt = str(ref_seq)
 	return(ref,alt)
 
-def set_ref_alt(ref, alt, row, vcf_df):
-	vcf_df.at[row, "REF"] = ref
-	vcf_df.at[row, "ALT"] = alt
+def set_ref_alt(ref, alt, row, df):
+	df.at[row, "REF"] = ref
+	df.at[row, "ALT"] = alt
 
-def get_random_len(svtype):
+def get_random_len(svtype):	
 	# INS, DEL, DUP, CNV, INV
 	df = pd.read_csv("sv_distributions/size_distrib" + svtype + ".tsv", sep="\t")
 	pb = df["pb"].tolist()
@@ -45,31 +49,38 @@ def get_random_len(svtype):
 	s = np.random.uniform(interval[0], interval[1], 1).round()
 	return(int(s[0]))
 
-# get the sequence of each variants in BED
-# output file is a VCF
-# (currently, not all variant supported by VISOR are included)
-# TODO : missing VISOR variant type : SNP, MNP and 4 types of tandem repeat
-
+### get the sequence of each variants in BED, output VCF
 # vcf_df : msprime VCF
 # bed_def : VISOR BED
 # fa_dict : FASTA where variants will be generated
 def get_seq(vcf_df, bed_df, fa_dict, output_file):
-	print(bed_df)
-# number of variants
-	n = len(vcf_df)
+	cols = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"]
+	ncol = len(vcf_df.columns) - len(cols)
+	li = [i for i in range(ncol)]
+	cols.extend(li)
+	vcf_df.columns = cols
+
+	df = vcf_df.join(bed_df)
+	# number of variants
+	n = len(df)
+	# add variants
+	series = []
 
 	for i in range(n):
-		sv_info = bed_df.iloc[i]
-		chr_name = sv_info[0]
-		start = sv_info[1]
-		start = start-1 # pour ajuster à l'index python
-		sv_type = sv_info[3]
+		## nom du chr pris dans le VCF
+		chr_name = df["CHROM"].iloc[i]
+		## position du variant pris dans le VCF
+		start = df["POS"].iloc[i]
+		start = int(start)-1 # pour ajuster à l'index python
+		## type de SV à générer
+		sv_info = df['SVINFO'].iloc[i]
+		sv_type = df['SVTYPE'].iloc[i]
 
 		fasta_seq = fa_dict[chr_name].upper()
 
 		if sv_type == "SNP":
-			ref = str(fasta_seq.seq[start:end])
-			alt = sv_info[4]
+			ref = str(fasta_seq.seq[start])
+			alt = SNP(ref)
 
 		elif sv_type == "deletion":
 			end = start + get_random_len("DEL")
@@ -91,10 +102,10 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file):
 			end = start + get_random_len("DUP")
 			ref = str(fasta_seq.seq[start])
 			alt_seq1 = str(fasta_seq.seq[start+1:end])
-			cp = int(sv_info[4])-1
+			cp = sv_info-1
 			# multiplication pour obtenir les copies
 			alt_seq = alt_seq1*cp
-			if sv_type =="inverted tandem duplication":
+			if sv_type == "inverted tandem duplication":
 				alt_seq = reverse(alt_seq)
 			
 			alt = ref + alt_seq
@@ -105,25 +116,22 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file):
 			end = start + l
 			
 			# informations sur la translocation
-			trans_info = sv_info[4]
-			print(trans_info, type(trans_info))
-			infos = trans_info.split(":")
-			trans_start = int(infos[2])-1
+			trans_chr = sv_info[0]
+			trans_start = sv_info[1]-1
 			trans_end = trans_start + l
-			trans_chr = infos[1]
-
+			fasta_seq_trans = fa_dict[trans_chr].upper()
+			
 			# translocation réciproque
 			if sv_type == "reciprocal translocation":
 				ref = str(fasta_seq.seq[start:end])
-				fasta_seq_trans = fa_dict[trans_chr].upper()
 				ref2 = str(fasta_seq_trans.seq[trans_start:trans_end])
 				
-				if infos[3] == "reverse":
+				if sv_info[2] == "reverse":
 					alt = str(fasta_seq.seq[start]) + reverse(str(fasta_seq_trans.seq[trans_start+1:trans_end]))
 				else :
 					alt = str(fasta_seq.seq[start]) + str(fasta_seq_trans.seq[trans_start+1:trans_end])
 
-				if infos[4] == "reverse":
+				if sv_info[3] == "reverse":
 					alt2 = str(fasta_seq_trans.seq[trans_start]) + reverse(str(fasta_seq.seq[start+1:end]))
 				else:
 					alt2 = str(fasta_seq_trans.seq[trans_start]) + str(fasta_seq.seq[start+1:end])
@@ -136,7 +144,7 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file):
 				
 				# insertion
 				ref2 = str(fasta_seq_trans.seq[trans_start])
-				if infos[3] == "reverse":
+				if sv_info[2] == "reverse":
 					alt2 = ref2 + reverse(str(fasta_seq.seq[start+1:end]))
 				else :
 					alt2 = ref2 + str(fasta_seq.seq[start+1:end])
@@ -144,7 +152,7 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file):
 			# copier-coller
 			else:
 				ref = str(fasta_seq.seq[start])
-				alt = vcf_df["ALT"].iloc[i]
+				alt = SNP(ref)
 
 				ref2 = str(fasta_seq_trans.seq[trans_start])
 				alt2 = ref2 + str(fasta_seq.seq[start+1:end])
@@ -154,14 +162,19 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file):
 			new_vcf_var[1] = trans_start+1
 			new_vcf_var[3] = ref2
 			new_vcf_var[4] = alt2
-			vcf_df.loc[len(vcf_df)] = new_vcf_var
+			save_series = pd.Series(new_vcf_var, index=cols)
+			series.append(save_series)
+
+		set_ref_alt(ref, alt, i, df)
 
-		set_ref_alt(ref, alt, i, vcf_df)
+	df_add = pd.DataFrame(series)
+	df = pd.concat([df, df_add], ignore_index=True)
+	df = df.drop(['SVTYPE', 'SVINFO'], axis=1)
 
 	# remove unnecessary columns for VCF
-	vcf_df["ID"] = "."
-	vcf_df["QUAL"] = 99
+	df["ID"] = "."
+	df["QUAL"] = 99
 	# output VCF
 	if not os.path.exists("results"):
 		os.mkdir("results")
-	vcf_df.to_csv("results/" + output_file + ".vcf", sep="\t", header=False, index=False)
\ No newline at end of file
+	df.to_csv("results/" + output_file + ".vcf", sep="\t", header=False, index=False)
\ No newline at end of file
diff --git a/fusion.py b/fusion.py
deleted file mode 100644
index ea0696c006791743574da20139f30d3ae4f0d7d4..0000000000000000000000000000000000000000
--- a/fusion.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# fusionner le VCF de msprime (position, génotype) et le BED de VISOR
-# pour obtenir un BED avec tous les variants
-# récupérer les variants et les génotypes msprime
-import io, os, shutil
-import pandas as pd
-
-def read_vcf(input_file):
-	with open(input_file, "r") as txt:
-		t = txt.read()
-	txt.close()
-
-	# write header for later
-	if not os.path.exists("results"):
-		os.mkdir("results")
-	f = open("results/vcf_header.txt", "w")
-	f.write(''.join(t.splitlines(keepends=True)[:6]))
-	f.close()
-
-	# remove VCF header for dataframe
-	t = ''.join(t.splitlines(keepends=True)[5:])
-	df = pd.read_table(io.StringIO(t))
-	df = df.rename(columns={"#CHROM" : "CHROM"})
-	return(df)
-
-def read_BED(input_file):
-	colnames=["chr", "start", "end", "type","info","breakpoint"]
-	df = pd.read_table(input_file, header=None, names=colnames)
-	return(df)
-
-def replace_bed_col(input_BED, input_VCF):
-	vcf = read_vcf(input_VCF)
-	bed = read_BED(input_BED)
-	if len(bed) == len(vcf):
-		bed["chr"] = vcf["CHROM"]
-		bed["start"] = vcf["POS"]
-	return(bed)
-
-def merge_final_vcf(header_file, content_file, merged_file):
-    with open(merged_file, 'wb') as outfile:
-        for filename in [header_file, content_file]:
-            with open(filename, 'rb') as infile:
-                shutil.copyfileobj(infile, outfile)
\ No newline at end of file
diff --git a/main.py b/main.py
deleted file mode 100644
index da8b303e1873f857b436809ca1336aff42301de5..0000000000000000000000000000000000000000
--- a/main.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from bed2vcf import read_fa, get_seq
-from fusion import *
-from tree_generation import msprime_vcf
-import defopt
-
-## n : length of variant
-def sv_vcf(bed: str, vcf: str, fasta: str, outName: str):
-	"""
-    Create a VCF with structural variants.
-    
-    :parameter bed: BED file of structural variants to include
-    :parameter vcf: VCF generated with msprime --> tree generation script
-	:parameter fasta: Reference FASTA for the VCF and the variants
-	:parameter outName: Output name
-    
-    """
-	bed_df = replace_bed_col(bed, vcf)
-	vcf_df=read_vcf(vcf)
-	fa = read_fa(fasta)
-
-	get_seq(vcf_df, bed_df, fa, outName)
-
-	header_file="results/vcf_header.txt"
-	content_file = "results/" + outName + ".vcf"
-	merged_file="results/" + outName + "_final.vcf"
-	merge_final_vcf(header_file, content_file, merged_file)
-
-if __name__ == '__main__':
-    defopt.run([sv_vcf, msprime_vcf])
-
-bed = "test/mini_random.bed"
-vcf = "test/output.vcf"
-fasta = "/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta"
-
-output_file="tritici_test"
-# main(bed, vcf, fasta, output_file)
-
-### generate msprime population VCF
-# fai = "ztIPO323.chr8.fasta.fai"
-
-# ## samtools FAI
-# ## population size
-# ## mutation rate
-# ## sample size
-# msprime_vcf(fai, 5000, 1e-8, 60)
\ No newline at end of file
diff --git a/randombed.py b/randombed.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f15a719a596e4a1eb6c5bfc722b159e9bf29833
--- /dev/null
+++ b/randombed.py
@@ -0,0 +1,72 @@
+import pandas as pd
+import yaml, random, re
+
+## read yaml file containing variants informations
+def read_yaml(f):
+	stream = open(f, 'r')
+	d = yaml.safe_load(stream)
+	stream.close()
+	if sum(d.values()) != 100:
+		raise ValueError("Sum of values must be 100")
+	return(d)
+
+infos_dict = {'deletion': None, 'insertion': None, 'inversion': None, 'SNP': None,
+		 'tandem duplication': 2, 'inverted tandem duplication': 2, 
+		 'translocation copy-paste': 0, 'translocation cut-paste': 0, 'reciprocal translocation': 0}
+
+## generate info field for translocations
+# randomly select chromosome to add sequence to
+def select_chr(fai):
+	df = pd.read_table(fai, header=None, usecols=[0,1], names =["name", "length"])
+	names=df["name"].to_list()
+	chr = random.choice(names)
+	df = pd.pivot_table(df, columns='name')
+	len = df.iloc[0][chr]
+	pos = random.randrange(len)
+	li = [chr, pos]
+	return(li)
+
+# randomly select sequence orientation
+def select_orientation(li):
+	li.append(random.choice(["forward", "reverse"]))
+	return(li)
+
+## create dataframe with all the variants to simulate
+def generate_type(nvar, yml, fai):
+	d = read_yaml(yml)
+	## list for variant type and info
+	vartype = []
+	infos = []
+	## filter dict to keep value != 0
+	dict = {k: v for k, v in d.items() if v != 0}
+	for k in dict:
+		# generate n variants
+		perc = dict[k]
+		n = round(perc*nvar/100)
+		# add info field
+		if re.search("translocation", k):
+			for i in range(n):
+				info = select_chr(fai)
+				info = select_orientation(info)		
+				if k == "reciprocal translocation":
+					info = select_orientation(info)
+				infos.extend([info])
+				i += 1
+		else:
+			info = infos_dict[k]
+			infos.extend([info]*n)
+		vartype.extend([k]*n)
+	
+	if len(vartype) != nvar:
+		n = nvar - len(vartype)
+		vartype.extend(["SNP"]*n)
+		infos.extend([None]*n)
+
+	for_df = {'SVTYPE': vartype, 'SVINFO': infos}
+
+	df = pd.DataFrame(data = for_df)
+	# shuffle values in dataframe
+	df = df.sample(frac = 1, ignore_index=True)
+	# print(df)
+	df.to_csv("random_var.tsv", sep="\t", index=False)
+	return(df)
diff --git a/tree_generation.py b/tree_generation.py
index 68b67f3e910c0aedde3f57d1ae629eb33022f610..223b71d494e4463a9aa380114da056c913144288 100644
--- a/tree_generation.py
+++ b/tree_generation.py
@@ -1,6 +1,7 @@
 import pandas as pd
-import math, msprime
+import math, msprime, argparse, os
 import numpy as np
+from multiprocessing import Process
 
 def chrom_pos(df):
     l=df["length"].to_list()
@@ -28,19 +29,42 @@ def msprime_map(df):
     rate_map=msprime.RateMap(position=map_positions, rate=rates)
     return(rate_map)
 
+### ts_chroms = list of trees for each chromosome
+### names = list of chromosome name
+def output_files(ts_chroms, names, outdir="results"):
+    if not os.path.exists(outdir):
+            os.mkdir(outdir)
+
+        ## write vcf
+    filename = outdir + "/msprime_simulation.vcf"
+    for i in range(len(ts_chroms)):
+        txt = ts_chroms[i].as_vcf(contig_id=names[i])
+        t = ''.join(txt.splitlines(keepends=True)[6:])
+        # filename = 'tmp_vcf/output' + str(i) + ".vcf"
+        if i==0:
+            header_top = ''.join(txt.splitlines(keepends=True)[:4])
+            header_bot = ''.join(txt.splitlines(keepends=True)[4:6])
+            with open(filename, "w") as vcf_file:
+                vcf_file.write(t)
+                # ts_chroms[i].write_vcf(vcf_file, contig_id=names[i])
+        else:
+            header_top += txt.splitlines(keepends=True)[3]
+            
+            with open(filename, "a") as vcf_file:
+                vcf_file.write(t)
+            vcf_file.close()
+    ## write header to file
+    header = header_top + header_bot
+    with open(outdir + "/header.vcf", "w") as head:
+        head.write(header)
+    head.close()
+
 # input_fai = samtools FAI of FASTA where variants will be generated
 # pop_size = population size
 # mut_rate = mutation rate
 # n = sample size / number of indiv
-def msprime_vcf(fai: str, pop_size: int, mut_rate: float, n: int):
-    """
-    Generate VCF for each chromosome in reference FASTA.
-
-    :parameter fai: FAI samtools index of reference FASTA
-    :parameter pop_size: population size (Ne)
-    :parameter mut_rate: mutation rate (µ)
-    :parameter n: sample size
-    """
+def msprime_vcf(fai, pop_size, mut_rate, n, outdir):
+
     df = pd.read_table(fai, header=None, usecols=[0,1], names =["name", "length"])
     rate_map=msprime_map(df)
 
@@ -55,7 +79,7 @@ def msprime_vcf(fai: str, pop_size: int, mut_rate: float, n: int):
     mutated_ts = msprime.sim_mutations(
         ts, rate=mut_rate, 
         # random_seed=5678, 
-        discrete_genome=False)
+        discrete_genome=True)
 
     ts_chroms = []
 
@@ -69,20 +93,26 @@ def msprime_vcf(fai: str, pop_size: int, mut_rate: float, n: int):
         start, end = chrom_positions[j: j + 2]
         chrom_ts = mutated_ts.keep_intervals([[start, end]], simplify=False).trim()
         ts_chroms.append(chrom_ts)
-        print(chrom_ts.sequence_length)
 
     names=df["name"].to_list()
 
-    for i in range(len(ts_chroms)):
-        filename="msprime_output_chr.vcf"
-        if i==0:
-            with open(filename, "w") as vcf_file:
-                ts_chroms[i].write_vcf(vcf_file, contig_id=names[i])
-        else:
-            txt = ts_chroms[i].as_vcf(contig_id=names[i])
-            t = ''.join(txt.splitlines(keepends=True)[6:])
-            with open(filename, "a") as vcf_file:
-                vcf_file.write(t)
-            vcf_file.close()
+    ## create files    
+    if outdir == None:
+        output_files(ts_chroms, names)
+    else:
+        output_files(ts_chroms, names, outdir)
+
+parser = argparse.ArgumentParser(description='Generate VCF for each chromosome in reference FASTA.')
+parser.add_argument('-fai', '--fai', type=str, required = True, help='FAI samtools index of reference FASTA')
+parser.add_argument('-p', '--popSize', type=int, required = True, help='population size (Ne)')
+parser.add_argument('-r', '--rate', type=float, required = True, help='mutation rate (µ)')
+parser.add_argument('-n', '--sampleSize', type=int, required = True, help='sample size')
+parser.add_argument('-o', '--outDir', type=str, help='output directory')
 
-    print(len(ts_chroms))
\ No newline at end of file
+if __name__ == '__main__':
+	args = parser.parse_args()
+	d = vars(args)
+	li = list(d.values())
+	p = Process(target=msprime_vcf, args=(li))
+	p.start()
+	p.join()
\ No newline at end of file
diff --git a/variants_generation.py b/variants_generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..df622cbf6a3a96a2a46643840382cfbe286c4cea
--- /dev/null
+++ b/variants_generation.py
@@ -0,0 +1,32 @@
+from bed2vcf import read_fa, get_seq
+from randombed import generate_type
+import argparse
+import pandas as pd
+from multiprocessing import Process
+
+## n : length of variant
+def sv_vcf(vcf, fasta, fai, yml, outName):
+	vcf_df = pd.read_table(vcf, sep="\t")
+	nvar = len(vcf_df)
+	bed_df = generate_type(nvar, yml, fai)
+	fa = read_fa(fasta)
+
+	get_seq(vcf_df, bed_df, fa, outName)
+
+# parse arguments
+parser = argparse.ArgumentParser(description='create final VCF')
+# parser.add_argument('-b','--bed', type=str, required = True, help='BED file of structural variants to include')
+parser.add_argument('-v', '--vcf', type=str, required = True, help='VCF generated with msprime --> tree generation script')
+parser.add_argument('-fa', '--fasta', type=str, required = True, help='Reference FASTA for the VCF and the variants')
+parser.add_argument('-fai', '--fai', type=str, required = True, help='Samtools index of reference FASTA')
+parser.add_argument('-y', '--yaml', type=str, required = True, help='Reference FASTA for the VCF and the variants')
+parser.add_argument('-o', '--outName', type=str, required = True, help='Output name')
+# parser.add_argument('--header', type=int, help='vcf header size')
+
+if __name__ == '__main__':
+	args = parser.parse_args()
+	d = vars(args)
+	li = list(d.values())
+	p = Process(target=sv_vcf, args=(li))
+	p.start()
+	p.join()
\ No newline at end of file