5329b8ef408a386596e4be59623a4402d62bff39
[qpalma.git] / tools / data_tools / parseGff.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import os.path
6 import csv
7 from PyGff import *
8 import cPickle
9
# Feature types that may legitimately appear in the annotation but carry no
# information this parser stores; rows of these types are skipped.
_IGNORED_FEATURES = frozenset((
    'chromosome',
    'five_prime_UTR',
    'three_prime_UTR',
    'mRNA',
    'CDS',
    'ncRNA',
    'pseudogenic_exon',
    'pseudogenic_transcript',
    'snoRNA',
    'snRNA',
    'miRNA',
    'tRNA',
))


def parse(gff_fid,pickle_fid):
    """Collect the genes of a tab-separated GFF annotation and pickle them.

    Every data row must have exactly nine tab-separated columns.  Column 0
    is used as the sequence/chromosome name, column 2 as the feature type,
    columns 3 and 4 as integer start/stop and column 6 as the strand.

    A 'gene' row opens a new Gene(name, start, stop, strand); each following
    'exon' row is added to it with addExon(start, stop).  The gene is stored
    when the next 'gene' or 'pseudogene' row is met, or when the input ends.
    A 'pseudogene' row stores the open gene (if any) and opens nothing, so
    pseudogenes themselves are not recorded.

    Blank lines and lines starting with '#' (GFF comments and directives)
    are skipped.

    NOTE(review): Gene is not defined in this file; it is expected to be
    provided by the 'from PyGff import *' at the top of the module.

    Parameters:
        gff_fid    -- iterable of text lines (e.g. an open annotation file).
        pickle_fid -- writable file object that receives the pickled list
                      of Gene objects.

    Raises:
        ValueError -- on a row without nine columns, a non-integer
                      start/stop, an unknown feature type, or an 'exon' row
                      that does not follow a 'gene' row.  (Earlier versions
                      used assert statements for these checks, which are
                      stripped when Python runs with -O.)
    """
    reader = csv.reader(gff_fid, delimiter='\t', quoting=csv.QUOTE_NONE)

    allGenes = []
    currentGene = None

    for row in reader:
        # csv yields [] for blank lines; '#' starts a comment or directive.
        if not row or row[0].startswith('#'):
            continue

        if len(row) != 9:
            raise ValueError('Error: line %d has %d columns, expected 9'
                             % (reader.line_num, len(row)))

        seqName = row[0]
        feature = row[2]
        start = int(row[3])
        stop = int(row[4])
        strand = row[6]

        if feature == 'gene':
            # A new gene closes the one that was being accumulated.
            if currentGene is not None:
                allGenes.append(currentGene)
            currentGene = Gene(seqName,start,stop,strand)

        elif feature == 'pseudogene':
            # Close the open gene; exons are not collected for pseudogenes.
            if currentGene is not None:
                allGenes.append(currentGene)
            currentGene = None

        elif feature == 'exon':
            if currentGene is None:
                raise ValueError('Error: line %d: exon without a preceding '
                                 'gene' % reader.line_num)
            currentGene.addExon(start,stop)

        elif feature not in _IGNORED_FEATURES:
            raise ValueError('Error: Unknown identifier \'%s\'' % feature)

    # The last gene has no following 'gene'/'pseudogene' row to flush it,
    # so store it here; otherwise it would be missing from the output.
    if currentGene is not None:
        allGenes.append(currentGene)

    cPickle.dump(allGenes,pickle_fid)
79
80
if __name__ == '__main__':
    # Usage: parseGff.py <annotation file> <output pickle file>
    # Explicit checks are used instead of assert so that they still run
    # under 'python -O' and give the user a readable message.
    if len(sys.argv) < 3:
        sys.exit('Usage: %s <annotation.gff> <output.pickle>' % sys.argv[0])

    annotFile = sys.argv[1]
    pickleFile = sys.argv[2]

    if not os.path.exists(annotFile):
        sys.exit('Error: annotation file \'%s\' does not exist' % annotFile)
    # Never overwrite an existing output file.
    if os.path.exists(pickleFile):
        sys.exit('Error: output file \'%s\' already exists' % pickleFile)

    # try/finally (rather than 'with') keeps this runnable on older
    # Python 2 interpreters while still closing both files on any exit path.
    gff_fid = open(annotFile)
    try:
        # Binary mode keeps the pickle bytes identical on every platform.
        pickle_fid = open(pickleFile,'wb')
        try:
            parse(gff_fid,pickle_fid)
        finally:
            pickle_fid.close()
    finally:
        gff_fid.close()