+ added read structure for new parser
[qpalma.git] / qpalma / tools / parseGff.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import os.path
6 import csv
7 import re
8 from PyGff import *
9 import cPickle
10 import copy
11
def parse(gff_fid):
    """Parse a tab-separated GFF annotation stream into a dict of genes.

    Every 'gene' record starts a new Gene object; each following 'exon'
    record is attached to that gene via ``addExon(start, stop)``. A
    'pseudogene' record closes the current gene without opening a new one.
    All other known feature types are read but ignored.

    Args:
        gff_fid: an iterable of text lines (e.g. an open file) in 9-column
            GFF format.

    Returns:
        A dict mapping ``gene.id`` to the Gene object. If two genes share
        an id, the later one replaces the earlier one.

    Raises:
        ValueError: if a data row does not have exactly 9 columns, if the
            start/stop columns are not integers, if a gene record carries
            no ``ID=`` attribute, if an exon appears without a preceding
            gene, or if the feature type is unknown.

    NOTE(review): ``Gene`` is not defined in this file; it presumably comes
    from the ``from PyGff import *`` wildcard import -- confirm.
    """
    # Feature types that are recognised but carry no information we keep.
    ignored_types = frozenset([
        'chromosome', 'five_prime_UTR', 'three_prime_UTR', 'mRNA', 'CDS',
        'ncRNA', 'pseudogenic_exon', 'pseudogenic_transcript', 'miRNA',
        'rRNA', 'snoRNA', 'snRNA', 'tRNA',
    ])

    # The ID value runs up to the next ';' or to the end of the attribute
    # column, so an ID that is the last attribute (no trailing ';') works.
    id_pattern = re.compile(r'ID=([^;]*)')

    reader = csv.reader(gff_fid, delimiter='\t', quoting=csv.QUOTE_NONE)

    all_genes = {}
    current_gene = None

    for row in reader:
        # Blank lines and '#' comment/directive lines (e.g. '##gff-version 3')
        # are not feature records.
        if not row or row[0].startswith('#'):
            continue

        if len(row) != 9:
            raise ValueError(
                'Error: expected 9 tab-separated columns, got %d: %r'
                % (len(row), row))

        seq_name = row[0]
        feature = row[2]
        start = int(row[3])
        stop = int(row[4])
        strand = row[6]
        attributes = row[8]

        if feature == 'gene':
            # A new gene closes the previous one.
            if current_gene is not None:
                all_genes[current_gene.id] = current_gene

            match = id_pattern.search(attributes)
            if match is None:
                raise ValueError(
                    'Error: gene record without ID attribute: %r' % attributes)
            current_gene = Gene(seq_name, start, stop, strand, match.group(1))

        elif feature == 'exon':
            if current_gene is None:
                raise ValueError(
                    'Error: exon record (%s:%d-%d) without a preceding gene'
                    % (seq_name, start, stop))
            current_gene.addExon(start, stop)

        elif feature == 'pseudogene':
            # Store the gene collected so far; the pseudogene itself is
            # not recorded.
            if current_gene is not None:
                all_genes[current_gene.id] = current_gene
            current_gene = None

        elif feature not in ignored_types:
            raise ValueError('Error: Unknown identifier \'%s\'' % feature)

    # Do not lose the last gene of the file.
    if current_gene is not None:
        all_genes[current_gene.id] = current_gene

    return all_genes
91
92
def createGffPickle(annotFile, pickleFile):
    """Parse a GFF annotation file and pickle the resulting gene dict.

    Args:
        annotFile: path of the GFF annotation file to read.
        pickleFile: path of the pickle file to write; an existing file at
            this path is overwritten.

    The annotation is parsed completely before the output file is opened,
    so a parse error does not leave an empty pickle file behind.
    """
    with open(annotFile) as gff_fid:
        allGenes = parse(gff_fid)

    # Pickle data is a byte stream: binary mode keeps it free of newline
    # translation on every platform.
    with open(pickleFile, 'wb') as pickle_fid:
        cPickle.dump(allGenes, pickle_fid)
100
if __name__ == '__main__':
    # Command line: <script> <annotation_file> <pickle_file>
    # Explicit checks instead of assert, so validation also runs under -O
    # and the user gets a readable message and a non-zero exit status.
    if len(sys.argv) < 3:
        sys.exit('Usage: %s <annotation_file> <pickle_file>' % sys.argv[0])

    annotFile = sys.argv[1]
    pickleFile = sys.argv[2]

    if not os.path.exists(annotFile):
        sys.exit('Error: annotation file \'%s\' does not exist' % annotFile)

    # Refuse to overwrite an existing pickle file.
    if os.path.exists(pickleFile):
        sys.exit('Error: pickle file \'%s\' already exists' % pickleFile)

    createGffPickle(annotFile, pickleFile)