diff --git a/.gitignore b/.gitignore index 91c1711040a5616a1e7f8c3081c8ad9bcefc9251..f7f2ef469c92ae120744e09ed58b10c34579b3a1 100644 --- a/.gitignore +++ b/.gitignore @@ -48,9 +48,6 @@ #!*.txt !*.yml !*.yaml -!Snakefile -!*.smk -!slurm_logs/ # 3) add a pattern to track the file patterns of section2 even if they are in # subdirectories diff --git a/bed2vcf.py b/bed2vcf.py index 3f76d2c0630a1b2a292bafbd4a6309d9be77dd69..13d59ad5cff4c0ca0c5a0a9a241425fdabcc94a2 100644 --- a/bed2vcf.py +++ b/bed2vcf.py @@ -35,24 +35,14 @@ def set_ref_alt(ref, alt, row, vcf_df): vcf_df.at[row, "ALT"] = alt def get_random_len(svtype): - # INV - if svtype == "INV": - rng = np.random.default_rng() - r = rng.random() - if r > 0.17: - s = np.random.uniform(250, 6000,1).round() - else: - s = np.random.uniform(6000, 45000,1).round() - # INS, DEL, DUP, CNV - else: - df = pd.read_csv("sv_distributions/size_distrib" + svtype + ".tsv", sep="\t") - pb = df["pb"].tolist() - li = np.random.multinomial(1, pb) - for i in range(len(li)): - if li[i] != 0: - interval = df["size_interval"].iloc[i] - interval = json.loads(interval) - s = np.random.uniform(interval[0], interval[1], 1).round() + # INS, DEL, DUP, CNV, INV + df = pd.read_csv("sv_distributions/size_distrib" + svtype + ".tsv", sep="\t") + pb = df["pb"].tolist() + li = np.random.multinomial(1, pb) + i = np.argmax(li) + interval = df["size_interval"].iloc[i] + interval = json.loads(interval) + s = np.random.uniform(interval[0], interval[1], 1).round() return(int(s[0])) # get the sequence of each variants in BED @@ -75,7 +65,7 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): start = start-1 # pour ajuster à l'index python sv_type = sv_info[3] - fasta_seq = fa_dict[chr_name] + fasta_seq = fa_dict[chr_name].upper() if sv_type == "deletion": end = start + get_random_len("DEL") @@ -121,7 +111,7 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): # translocation réciproque if sv_type == "reciprocal translocation": ref = str(fasta_seq.seq[start:end]) - fasta_seq_trans = fa_dict[trans_chr] + fasta_seq_trans = fa_dict[trans_chr].upper() ref2 = str(fasta_seq_trans.seq[trans_start:trans_end]) if infos[3] == "reverse": @@ -166,15 +156,8 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): # remove unnecessary columns for VCF vcf_df["ID"] = "." - - # adjust variant start position to include reference - # vcf_df["POS"] = vcf_df["POS"] - 1 ### Attention aux translocation avec le nouveau start ? - + vcf_df["QUAL"] = 99 # output VCF if not os.path.exists("results"): os.mkdir("results") - vcf_df.to_csv("results/" + output_file, sep="\t", header=False, index=False) - - - -# vcf_header="##source=VISOR BED to VCF\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE" \ No newline at end of file + vcf_df.to_csv("results/" + output_file + ".vcf", sep="\t", header=False, index=False) \ No newline at end of file diff --git a/fusion.py b/fusion.py index fbd2de9dffbe7540f1f1a2070f420668283ae420..ea0696c006791743574da20139f30d3ae4f0d7d4 100644 --- a/fusion.py +++ b/fusion.py @@ -1,13 +1,21 @@ # fusionner le VCF de msprime (position, génotype) et le BED de VISOR # pour obtenir un BED avec tous les variants # récupérer les variants et les génotypes msprime -import io +import io, os, shutil import pandas as pd def read_vcf(input_file): with open(input_file, "r") as txt: t = txt.read() txt.close() + + # write header for later + if not os.path.exists("results"): + os.mkdir("results") + f = open("results/vcf_header.txt", "w") + f.write(''.join(t.splitlines(keepends=True)[:6])) + f.close() + # remove VCF header for dataframe t = ''.join(t.splitlines(keepends=True)[5:]) df = pd.read_table(io.StringIO(t)) @@ -25,6 +33,10 @@ def replace_bed_col(input_BED, input_VCF): if len(bed) == len(vcf): bed["chr"] = vcf["CHROM"] bed["start"] = vcf["POS"] - # piocher dans une distribution de taille de SV --> modéliser sur la distrib humain, etc - # bed["end"] = vcf["POS"] + len_SV - return(bed) \ No newline at end of file + return(bed) + +def merge_final_vcf(header_file, content_file, merged_file): + with open(merged_file, 'wb') as outfile: + for filename in [header_file, content_file]: + with open(filename, 'rb') as infile: + shutil.copyfileobj(infile, outfile) \ No newline at end of file diff --git a/main.py b/main.py index 0b47c0fea1021a91451eaf4a72a345b695ba2a37..82e271348e4121250f47ced98e3dad4f616c9573 100644 --- a/main.py +++ b/main.py @@ -15,11 +15,15 @@ def main(bed, vcf, fasta, output_file): bed = "test/mini_random.bed" vcf = "test/output.vcf" fasta = "/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta" -# n = 200 -output_file="60_tritici.vcf" +output_file="tritici_test" main(bed, vcf, fasta, output_file) +header_file="results/vcf_header.txt" +content_file = "results/" + output_file + ".vcf" +merged_file="results/" + output_file + "_final.vcf" +merge_final_vcf(header_file, content_file, merged_file) + ### generate msprime population VCF # fai = "ztIPO323.chr8.fasta.fai" diff --git a/random_bed.sh b/random_bed.sh index bb9cf618a5ba77298e99e621db062c782ba61541..be169a3957a89d2e002e6de2f53b46465ff68c05 100644 --- a/random_bed.sh +++ b/random_bed.sh @@ -1,23 +1,6 @@ ### create random BED -# refFAI="ztIPO323.chr8.fasta.fai" +refFAI="ztIPO323.chr8.fasta.fai" -# cut -f1,2 $refFAI > chrom.dim.tsv -# Rscript randomregion.r -d chrom.dim.tsv -n 2638 -v 'deletion,inversion,inverted tandem duplication,translocation copy-paste,reciprocal translocation' -r '40:20:20:10:10' | sortBed > HACk.random.bed - - -### vg construct -IMG="/home/sukanya/singularity/builddir/vg_v1.51.0.sif" -REF="/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta" -VCF="/home/sukanya/repos/00_to_forgemia/msprime_VISOR_fusionVCF/results/60_tritici.vcf" - -# $IMG vg construct -r $REF -v $VCF -S -f -m 250000000 -a >test/test.vg -$IMG vg construct -r $REF -v $VCF -S -f -m 250000000 >test/test.vg - -# # $IMG vg convert -# $IMG vg paths -F -v test/test.vg >test/test.fasta -# $IMG vg paths -E -x test/test.vg - -$IMG vg convert -f test/test.vg > test/test.gfa - -# $IMG vg deconstruct -p chr_8 test/test.gfa > test/test.vcf \ No newline at end of file +cut -f1,2 $refFAI > chrom.dim.tsv +Rscript randomregion.r -d chrom.dim.tsv -n 2638 -v 'deletion,inversion,inverted tandem duplication,translocation copy-paste,reciprocal translocation' -r '40:20:20:10:10' | sortBed > HACk.random.bed \ No newline at end of file diff --git a/sv_distributions/size_distribINV.tsv b/sv_distributions/size_distribINV.tsv new file mode 100644 index 0000000000000000000000000000000000000000..4f5ccb9f230409a111603b55b16c8f53f8e46f8c --- /dev/null +++ b/sv_distributions/size_distribINV.tsv @@ -0,0 +1,3 @@ +size_interval pb +[250.0, 6000.0] 0.83 +[6000.0, 50200.0] 0.17 \ No newline at end of file diff --git a/visor_sv_type.yaml b/visor_sv_type.yaml index 42a807560c0f26f52d743f597a5467876c3e876b..5aafc79c73da30d758fb338b1b5efd1b9373524e 100644 --- a/visor_sv_type.yaml +++ b/visor_sv_type.yaml @@ -14,8 +14,8 @@ translocation copy-paste: 0 translocation cut-paste : 0 reciprocal translocation: 0 SNP: 0 -MNP: 0 -tandem repeat contraction: 0 -tandem repeat expansion: 0 -perfect tandem repetition: 0 -approximate tandem repetition: 0 \ No newline at end of file +# MNP: 0 +# tandem repeat contraction: 0 +# tandem repeat expansion: 0 +# perfect tandem repetition: 0 +# approximate tandem repetition: 0 \ No newline at end of file