+ wrote faster parser / file processing tools in C
[qpalma.git] / tools / data_tools / parseGff.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import os.path
6 import csv
7 from PyGff import *
8 import cPickle
9 import copy
10
def parse(gff_fid,pickle_fid):
    """Parse a tab-separated GFF annotation and pickle the genes found in it.

    Every 'gene' row starts a new Gene object and every following 'exon'
    row is attached to it via addExon(start, stop).  A 'pseudogene' row
    closes the gene that is currently open without starting a new one.
    'chromosome' rows and all other known feature types are skipped.

    NOTE(review): Gene is presumably provided by the star import from
    PyGff -- confirm its constructor signature there.

    Parameters:
        gff_fid    -- iterable of text lines (e.g. an open GFF file).
        pickle_fid -- writable file object; receives the list of genes
                      through cPickle.dump.

    Raises:
        ValueError -- if a data row does not have exactly nine columns, if
                      start/stop are not integers, if an 'exon' row appears
                      while no gene is open, or if the feature type is
                      unknown.
    """
    # Feature types that are recognised but deliberately not evaluated.
    ignored_features = frozenset([
        'five_prime_UTR', 'three_prime_UTR', 'mRNA', 'CDS', 'ncRNA',
        'pseudogenic_exon', 'pseudogenic_transcript', 'miRNA', 'rRNA',
        'snoRNA', 'snRNA', 'tRNA',
    ])

    reader = csv.reader(gff_fid, delimiter='\t', quoting=csv.QUOTE_NONE)

    all_genes = []
    current_gene = None

    for row in reader:
        # Blank lines and '#' comment/directive lines (e.g. '##gff-version 3')
        # carry no feature data and must not trip the column check below.
        if not row or row[0].startswith('#'):
            continue

        # Explicit raise instead of assert: asserts vanish under 'python -O'.
        if len(row) != 9:
            raise ValueError('Error: line %d has %d columns, expected 9'
                             % (reader.line_num, len(row)))

        seq_name = row[0]
        feature = row[2]
        start = int(row[3])
        stop = int(row[4])
        strand = row[6]

        if feature == 'chromosome':
            continue

        if feature == 'gene':
            # A new gene begins: store the one collected so far.
            if current_gene is not None:
                all_genes.append(current_gene)
            current_gene = Gene(seq_name, start, stop, strand)

        elif feature == 'exon':
            if current_gene is None:
                raise ValueError('Error: line %d: exon without enclosing gene'
                                 % reader.line_num)
            current_gene.addExon(start, stop)

        elif feature == 'pseudogene':
            # Close the open gene; rows that follow belong to the pseudogene
            # and must not be attached to it.
            if current_gene is not None:
                all_genes.append(current_gene)
            current_gene = None

        elif feature in ignored_features:
            pass

        else:
            raise ValueError('Error: Unknown identifier \'%s\'' % feature)

    # Do not lose the gene that was still open when the input ended.
    if current_gene is not None:
        all_genes.append(current_gene)

    cPickle.dump(all_genes, pickle_fid)
85
if __name__ == '__main__':
    # Command line: <script> <gff annotation file> <pickle output file>
    # Explicit checks instead of asserts, so they still hold under
    # 'python -O' (in particular the guard against overwriting a pickle).
    if len(sys.argv) < 3:
        sys.exit('Usage: %s <gff_file> <pickle_file>' % sys.argv[0])

    annotFile = sys.argv[1]
    pickleFile = sys.argv[2]

    if not os.path.exists(annotFile):
        sys.exit('Error: annotation file \'%s\' does not exist' % annotFile)
    if os.path.exists(pickleFile):
        sys.exit('Error: output file \'%s\' already exists' % pickleFile)

    # try/finally guarantees both handles are closed (and the pickle is
    # flushed to disk) even when parse() raises.
    gff_fid = open(annotFile)
    try:
        pickle_fid = open(pickleFile,'w+')
        try:
            parse(gff_fid,pickle_fid)
        finally:
            pickle_fid.close()
    finally:
        gff_fid.close()