+ refactored code further
[qpalma.git] / qpalma / parsers.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import pdb
5
class ReadParser(object):
    """
    A base class for the Solexa reads parsers.

    The parser opens ``filename`` on construction and is its own iterator:
    every iteration step reads one line from the file, strips surrounding
    whitespace and hands it to ``parseLine``. Subclasses customise the result
    by overriding ``parseLine`` (and, if needed, ``__next__`` / ``parse``).

    The parser can be used as a context manager so the underlying file is
    closed deterministically::

        with ReadParser(filename) as parser:
            lines = parser.parse()
    """

    def __init__(self, filename):
        """Open ``filename`` for reading; the handle is kept in ``self.fh``."""
        self.fh = open(filename)

    def close(self):
        """Close the underlying file handle."""
        self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never swallow an exception raised inside the ``with`` body.
        return False

    def __iter__(self):
        return self

    def parseLine(self, line):
        """
        Convert one stripped line into an entry.

        The base implementation returns the line unchanged.
        """
        return line

    def __next__(self):
        """
        Return the entry for the next line of the file.

        Raises StopIteration (propagated from the file iterator) once the
        file is exhausted.
        """
        return self.parseLine(next(self.fh).strip())

    def next(self):
        """Python 2 spelling of the iterator protocol; see ``__next__``."""
        # Dispatch dynamically so a subclass overriding only __next__ works.
        return self.__next__()

    def parse(self):
        """Consume the remaining lines and return their entries as a list."""
        return list(self)
27
class FilteredReadParser(ReadParser):
    """
    This class offers a parser for the reads that are created by the first
    filtering step performed to cut and join new reads.
    """

    def __init__(self, filename):
        ReadParser.__init__(self, filename)

    def parseLine(self, line):
        """
        Parse one whitespace-separated line into a dictionary.

        We assume that a line has the following entries:

        read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop

        Conversions applied:
          - chr, splitpos, read_size, p_start, exon_stop, exon_start and
            p_stop are converted to int
          - seq is lower-cased
          - strand 'D' becomes '+', strand 'P' becomes '-'
          - prb, cal_prb and chastity are decoded character by character
            into lists of ints using the offsets -50, -64 and +10

        Returns a dict keyed by the field names above (the read number is
        stored under 'id').

        Raises ValueError if the line does not have exactly 14 fields, if a
        numeric field is not an integer, or if strand is neither 'D' nor 'P'.
        """
        (read_id, chrom, strand, seq, splitpos, read_size, prb, cal_prb,
         chastity, gene_id, p_start, exon_stop, exon_start,
         p_stop) = line.split()

        splitpos = int(splitpos)
        read_size = int(read_size)

        seq = seq.lower()

        # Explicit check instead of an assert so validation survives `-O`.
        if strand == 'D':
            strand = '+'
        elif strand == 'P':
            strand = '-'
        else:
            raise ValueError("unknown strand %r (expected 'D' or 'P')" % (strand,))

        chrom = int(chrom)

        prb = [ord(elem) - 50 for elem in prb]
        cal_prb = [ord(elem) - 64 for elem in cal_prb]
        chastity = [ord(elem) + 10 for elem in chastity]

        p_start = int(p_start)
        exon_stop = int(exon_stop)
        exon_start = int(exon_start)
        p_stop = int(p_stop)

        line_d = {
            'id': read_id, 'chr': chrom, 'strand': strand, 'seq': seq,
            'splitpos': splitpos, 'read_size': read_size, 'prb': prb,
            'cal_prb': cal_prb, 'chastity': chastity, 'gene_id': gene_id,
            'p_start': p_start, 'exon_stop': exon_stop,
            'exon_start': exon_start, 'p_stop': p_stop,
        }
        return line_d

    def __next__(self):
        """
        Return the parsed entry for the next non-blank line of the file.

        Raises StopIteration when the file is exhausted.
        """
        # The file iterator keeps its position, so returning from inside the
        # loop simply resumes at the following line on the next call.
        for line in self.fh:
            line = line.strip()
            if line:
                return self.parseLine(line)

        raise StopIteration

    def next(self):
        """Python 2 spelling of the iterator protocol; see ``__next__``."""
        return self.__next__()

    def parse(self):
        """
        Parse the remaining lines into a dict mapping read id -> entry.

        Blank lines are skipped. Raises ValueError if a read id occurs more
        than once.
        """
        entries = {}

        for line_d in self:
            read_id = line_d['id']
            if read_id in entries:
                raise ValueError("duplicate read id %r" % (read_id,))
            entries[read_id] = line_d

        return entries
91
class RemappedReadParser(ReadParser):
    """
    This class offers a parser for the reads that are remapped by the vmatch
    utility.

    Each line is expected to hold eight whitespace-separated entries:

    ID, Chromosome 1, Position 1, Sequence 1, Chromosome 2, Position 2, Sequence 2, Quality

    """

    def __init__(self, filename):
        ReadParser.__init__(self, filename)

    def parseLine(self, line):
        """
        Parse one whitespace-separated line into a dictionary.

        We assume that a line has the following entries:

        id,chr1,pos1,seq1,chr2,pos2,seq2,quality

        chr1, chr2, pos1 and pos2 are converted to int, seq1 and seq2 are
        lower-cased, id and quality are kept as strings.

        Raises ValueError if the line does not have exactly 8 fields or a
        numeric field is not an integer.
        """
        read_id, chr1, pos1, seq1, chr2, pos2, seq2, quality = line.split()

        chr1 = int(chr1)
        chr2 = int(chr2)

        seq1 = seq1.lower()
        seq2 = seq2.lower()

        pos1 = int(pos1)
        pos2 = int(pos2)

        # Store the parsed quality column itself, not a placeholder string.
        line_d = {
            'id': read_id, 'chr1': chr1, 'pos1': pos1, 'seq1': seq1,
            'chr2': chr2, 'pos2': pos2, 'seq2': seq2, 'quality': quality,
        }

        return line_d

    def __next__(self):
        """
        Return the parsed entry for the next non-blank line of the file.

        Raises StopIteration when the file is exhausted.
        """
        # The file iterator keeps its position, so returning from inside the
        # loop simply resumes at the following line on the next call.
        for line in self.fh:
            line = line.strip()
            if line:
                return self.parseLine(line)

        raise StopIteration

    def next(self):
        """Python 2 spelling of the iterator protocol; see ``__next__``."""
        return self.__next__()

    def parse(self):
        """
        Parse the remaining lines into a dict mapping id -> list of entries.

        An id may occur on several lines; all of its entries are collected,
        in file order, in the list stored under that id. Blank lines are
        skipped.
        """
        entries = {}

        for line_d in self:
            # setdefault creates the list on first sight of an id and
            # appends on every later occurrence.
            entries.setdefault(line_d['id'], []).append(line_d)

        return entries
148
149
class MapParser(ReadParser):
    """
    This class offers a parser for the reads that are remapped by the vmatch
    utility.

    According to the docu the entries are:

    ID, Chromosome, Position, Orientation (D or P), Mismatches, Alignment length, Offset, Alignment

    """

    def __init__(self, filename):
        ReadParser.__init__(self, filename)

    def parseLine(self, line):
        """
        Parse one whitespace-separated line into a dictionary.

        We assume that a line has the following entries:

        id,chr,pos,strand,mismatches,length,offset,seq

        chr, pos, mismatches, length and offset are converted to int, seq is
        lower-cased, and strand 'D' / 'P' is translated to '+' / '-' (any
        other strand value is passed through unchanged).

        Raises ValueError if the line does not have exactly 8 fields or a
        numeric field is not an integer.
        """
        (read_id, chrom, pos, strand, mismatches, length, offset,
         seq) = line.split()

        chrom = int(chrom)
        pos = int(pos)

        # Unknown orientation codes are deliberately left untouched.
        strand = {'D': '+', 'P': '-'}.get(strand, strand)

        mismatches = int(mismatches)
        length = int(length)
        offset = int(offset)

        seq = seq.lower()

        line_d = {
            'id': read_id, 'chr': chrom, 'pos': pos, 'strand': strand,
            'mismatches': mismatches, 'length': length, 'offset': offset,
            'seq': seq,
        }

        return line_d

    def __next__(self):
        """
        Return the parsed entry for the next non-blank line of the file.

        Raises StopIteration when the file is exhausted.
        """
        # The file iterator keeps its position, so returning from inside the
        # loop simply resumes at the following line on the next call.
        for line in self.fh:
            line = line.strip()
            if line:
                return self.parseLine(line)

        raise StopIteration

    def next(self):
        """Python 2 spelling of the iterator protocol; see ``__next__``."""
        return self.__next__()

    def parse(self):
        """
        Parse the remaining lines into a dict mapping id -> list of entries.

        An id may occur on several lines; all of its entries are collected,
        in file order, in the list stored under that id. Blank lines are
        skipped.
        """
        entries = {}

        for line_d in self:
            # setdefault creates the list on first sight of an id and
            # appends on every later occurrence.
            entries.setdefault(line_d['id'], []).append(line_d)

        return entries