+ changed check for initial setting of old_w. (matrix != None does not work in cvxopt)
[qpalma.git] / qpalma / tools / parseGff.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import os.path
6 import csv
7 import re
8 from PyGff import *
9 import cPickle
10 import copy
11
def parse_gff(gff_filename):
    """Parse a tab-separated GFF annotation file into a dict of genes.

    Every data row must have exactly nine columns.  The columns used are
    0 (sequence name), 2 (feature type), 3 and 4 (start and stop, converted
    to int), 6 (strand) and 8 (attributes).

    A 'gene' row starts a new Gene, constructed from the sequence name,
    start, stop, strand and the value of the ``ID=`` attribute.  Following
    'exon' rows are attached to that gene through ``addExon(start, stop)``.
    A 'pseudogene' row stores the gene currently being built without
    starting a new one.  'chromosome' rows and a fixed set of other known
    feature types are ignored.  Blank lines and lines whose first column
    starts with '#' are skipped.

    Parameters:
        gff_filename: path of the GFF file to read.

    Returns:
        dict mapping each gene's ``id`` attribute to its Gene object.  If
        two genes share an id, the later one replaces the earlier one.

    Raises:
        AssertionError: if a row does not have nine columns, a 'gene' row
            has no ``ID=`` attribute, an 'exon' row appears while no gene
            is open, or the feature type is unknown.  (These checks are
            plain asserts and are therefore disabled under ``python -O``.)
        ValueError: if the start or stop column is not an integer.
    """
    # Feature types that are accepted but contribute nothing to the result.
    ignored_features = frozenset([
        'chromosome',
        'five_prime_UTR',
        'three_prime_UTR',
        'mRNA',
        'CDS',
        'ncRNA',
        'pseudogenic_exon',
        'pseudogenic_transcript',
        'miRNA',
        'rRNA',
        'snoRNA',
        'snRNA',
        'tRNA',
    ])

    # Capture the ID value up to the next ';' or the end of the column, so
    # an ID that is the last attribute (no trailing ';') is still found.
    id_rx = re.compile(r'ID=([^;]*)')

    allGenes = {}
    currentGene = None

    # The with-block closes the annotation file even if a row is rejected.
    with open(gff_filename) as gff_fid:
        reader = csv.reader(gff_fid, delimiter='\t', quoting=csv.QUOTE_NONE)

        for row in reader:
            # Blank lines come back as empty rows; '#' marks comments and
            # directives such as '##gff-version 3'.
            if not row or row[0].startswith('#'):
                continue

            assert len(row) == 9, \
                'Error: expected 9 columns, found %d' % len(row)
            seqname = row[0]
            feature = row[2]
            start = int(row[3])
            stop = int(row[4])
            strand = row[6]
            attributes = row[8]

            if feature in ignored_features:
                continue

            if feature == 'gene':
                # A new gene closes the one that was being built.
                if currentGene is not None:
                    allGenes[currentGene.id] = currentGene

                id_match = id_rx.search(attributes)
                assert id_match is not None, \
                    'Error: gene without ID attribute: \'%s\'' % attributes
                currentGene = Gene(seqname, start, stop, strand,
                                   id_match.group(1))

            elif feature == 'exon':
                assert currentGene is not None, \
                    'Error: exon without a preceding gene'
                currentGene.addExon(start, stop)

            elif feature == 'pseudogene':
                # Store the open gene; the pseudogene itself is not kept,
                # so no gene is open afterwards.
                if currentGene is not None:
                    allGenes[currentGene.id] = currentGene
                currentGene = None

            else:
                assert False, 'Error: Unknown identifier \'%s\'' % feature

    # The last gene in the file has no following row to close it.
    if currentGene is not None:
        allGenes[currentGene.id] = currentGene

    return allGenes
95
96
def createGffPickle(annotFile,pickleFile):
    """Parse a GFF annotation file and pickle the resulting gene dict.

    Parameters:
        annotFile: path of the GFF file; passed to parse_gff.
        pickleFile: path of the pickle file to write.  It is created, or
            truncated if it already exists.
    """
    # Parse before touching the output path, so a malformed annotation file
    # does not leave an empty pickle file behind.
    allGenes = parse_gff(annotFile)

    # The with-block flushes and closes the pickle file on every exit path.
    with open(pickleFile,'w+') as pickle_fid:
        cPickle.dump(allGenes,pickle_fid)
104
if __name__ == '__main__':
    # Usage: parseGff.py <annotation.gff> <output.pickle>
    # Explicit checks instead of asserts: they still run under `python -O`
    # and report a readable message (sys.exit with a string writes it to
    # stderr and exits with status 1).
    if len(sys.argv) < 3:
        sys.exit('Usage: %s <annotation.gff> <output.pickle>' % sys.argv[0])

    annotFile = sys.argv[1]
    pickleFile = sys.argv[2]

    if not os.path.exists(annotFile):
        sys.exit('Error: annotation file \'%s\' does not exist' % annotFile)

    # Never overwrite an existing pickle file.
    if os.path.exists(pickleFile):
        sys.exit('Error: pickle file \'%s\' already exists' % pickleFile)

    createGffPickle(annotFile,pickleFile)