Skip to content
Snippets Groups Projects
Commit fab0779b authored by Helene Rimbert's avatar Helene Rimbert
Browse files

new script to reformat output of gmap for whole genome results. Gmap create 1...

new script to reformat output of gmap for whole genome results. Gmap create 1 feature gene per mrna. This script merges all mrna features under the same gene feature
parent e51b6a52
Branches
Tags v0.3
No related merge requests found
#!/usr/bin/env python3
# coding: utf-8
import os.path
from os import path
from pprint import pprint
from collections import defaultdict
import argparse
import sys
import re
class reformatGmapGff (object):
def __init__(self):
"""
Global var
"""
self.chromosomeMap=defaultdict()
self.knownGenes=[]
self.knownMrna=[]
self.geneIsoforms=defaultdict()
self.geneFeatures=defaultdict()
self.mrnaFeatures=defaultdict()
def main(self):
self.getOptions()
self.checkInputs()
# load inputs
self.loadChromosomeMap()
# reformat gmap gff3 output
self.outputFH=open(self.options.output, 'w')
print('##gff-version 3', file=self.outputFH)
self.errorFH=open(self.options.error, 'w')
print('##gff-version 3', file=self.errorFH)
self.reformatGff()
self.writeOutputGff()
self.outputFH.close()
self.errorFH.close()
def writeOutputGff(self):
for gene in self.knownGenes:
print('###', file=self.outputFH)
gff_gene=[self.geneFeatures[gene]['chrom'],
self.geneFeatures[gene]['source'],
self.geneFeatures[gene]['featureType'],
self.geneFeatures[gene]['start'],
self.geneFeatures[gene]['stop'],
self.geneFeatures[gene]['score'],
self.geneFeatures[gene]['strand'],
self.geneFeatures[gene]['frame'],
self.geneFeatures[gene]['attributes'] ]
print('\t'.join(map(str, gff_gene)), file=self.outputFH)
fake_gene_feature = 1
for isoform in self.geneIsoforms[gene]:
pprint(" working with gene {} isoform {}".format(gene, isoform))
for feature in self.mrnaFeatures[isoform]:
chromosome=feature[0]
if chromosome == self.geneFeatures[gene]['chrom']:
print("\t".join(map(str, feature)), file=self.outputFH)
else:
line="\t".join(map(str, feature))
featureId=self.getFeatureAttribute(gff=line, attribute='ID')
sys.stderr.write(' WARNING: Isoform {} is mapped on a different chromosome than gene {}. skipp isoform\n'.format(featureId, gene))
if feature[2] == 'mRNA' :
if fake_gene_feature == 1:
fakeGeneFeature=feature[0:2] + ['gene'] + feature[3:8] + ['ID='+gene]
print('###', file=self.errorFH)
print("\t".join(map(str, fakeGeneFeature)), file=self.errorFH)
fake_gene_feature = 0
print("\t".join(map(str, feature)), file=self.errorFH)
else:
print("\t".join(map(str, feature)), file=self.errorFH)
def reformatGff(self):
with open(self.options.input) as mygff:
for line in mygff.readlines():
if not line.startswith('#'):
featuretype=line.rstrip('\n').split('\t')[2]
featureId=self.getFeatureAttribute(gff=line, attribute='ID')
gff_array=line.rstrip('\n').split('\t')
if featuretype == 'gene':
# get only the gene name because here the ID corresponds to the mrna isoform which has been mapped
geneId=featureId.split('.')[0]
if geneId not in self.knownGenes:
self.knownGenes.append(geneId)
self.geneIsoforms[geneId]=[]
gff_array[8] = self.setGffAttributes(attributes=gff_array[8], new={'ID': geneId, 'Name': geneId})
self.geneFeatures[geneId] = {'chrom':gff_array[0],
'source': gff_array[1],
'featureType': gff_array[2],
'start': gff_array[3],
'stop': gff_array[4],
'score': gff_array[5],
'strand': gff_array[6],
'frame': gff_array[7],
'attributes': gff_array[8]}
elif featuretype == 'mRNA':
mrna=self.getFeatureAttribute(gff=line, attribute='ID')
self.knownMrna.append(mrna)
parent=mrna.split('.')[0]
self.geneIsoforms[parent].append(mrna)
gff_array[8] = self.setGffAttributes(attributes=gff_array[8], new={'Parent': parent})
self.mrnaFeatures[mrna]=[gff_array]
# check if this new mrna has start or stop different from the gene feature
if gff_array[3] < self.geneFeatures[parent]['start'] and gff_array[0] == self.geneFeatures[parent]['chrom']:
self.geneFeatures[parent]['start'] = gff_array[3]
sys.stderr.write(" Update start of gene {}\n".format(parent))
sys.stderr.write(" New gene length: {}\n".format(int(gff_array[4])-int(gff_array[3])+1))
if gff_array[4] > self.geneFeatures[parent]['stop'] and gff_array[0] == self.geneFeatures[parent]['chrom']:
self.geneFeatures[parent]['stop'] = gff_array[4]
sys.stderr.write(" Update stop of gene {}\n".format(parent))
sys.stderr.write(" New gene length: {}\n".format(int(gff_array[4])-int(gff_array[3])+1))
else:
featureId=self.getFeatureAttribute(gff=line, attribute='ID')
featureParent=self.getFeatureAttribute(gff=line, attribute='Parent')
self.mrnaFeatures[featureParent].append(line.rstrip('\n').split('\t'))
numgenes=len(self.knownGenes)
nummrna=len(self.knownMrna)
sys.stderr.write(" Found {} genes for {} mRNA in {} GFF3 file.\n".format(numgenes, nummrna, self.options.output))
def setGffAttributes(self, attributes, new):
attr_array=attributes.rstrip('\n').split(';')
attr_dict=defaultdict()
ordered_keys=[]
for attr in attr_array:
(key, val) = attr.split('=')
attr_dict[key] = val
ordered_keys.append(key)
for newAttr in new.keys():
attr_dict[newAttr] = new[newAttr]
output_array=[]
for attr in ordered_keys:
output_array.append(str(attr)+'='+str(attr_dict[attr]))
return ';'.join(map(str, output_array))
def getFeatureAttribute(self, gff, attribute):
attr_array=gff.rstrip('\n').split('\t')[8].split(';')
attr_dict=defaultdict()
for attr in attr_array:
(key, val) = attr.split('=')
attr_dict[key] = val
if attribute in attr_dict.keys():
return attr_dict[attribute]
else:
sys.stderr.write(' ERROR: no attribute {} in gff record {}'.format(attribute, gff))
sys.exit()
def loadChromosomeMap(self):
with open(self.options.map) as maprecord:
for line in maprecord.readlines():
if not line.startswith('#'):
(name, id) = line.rstrip('\n').split('\t')
self.chromosomeMap[name] = id
sys.stdout.write(' found {} chromosome records in file {}\n'.format(len(self.chromosomeMap.keys()),self.options.map ))
pprint(self.chromosomeMap)
def getOptions(self):
parser=argparse.ArgumentParser(description='Check If some genes are missing in the final annotation')
parser.add_argument('-i', '--input', help='Input GFF3 file to rename ', required=True)
parser.add_argument('-o', '--output', help='Output GFF3 file', required=True)
parser.add_argument('-e', '--error', help='Mrna with anchoring on different chromosome than gene feature', required=True)
parser.add_argument('-m', '--map', help='File of mapping of chromosome ID (two columns, chromosome Name<TAB>Chromosome code', required=True)
self.options=parser.parse_args()
sys.stdout.write(" Parameters : {}".format(self.options))
def checkInputs(self):
"""
check for input files and output directory
"""
sys.stdout.write(' Check input files:')
self.checkFile(file=self.options.input)
self.checkFile(file=self.options.map)
def checkFile (self,file):
if os.path.isfile(file):
sys.stdout.write(" File %s found\n" % str(file))
else:
sys.stdout.write(" ERROR: Cannot find file %s \n" % str(file))
sys.exit()
if __name__== "__main__":
run=reformatGmapGff()
run.main()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment