git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8610 e1793c9e...
[qpalma.git] / qpalma / parsers.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import pdb
5
class ReadParser:
    """
    A base class for the Solexa reads parsers.

    The parser opens ``filename`` on construction and is its own iterator:
    each non-blank line of the file is stripped and handed to parseLine().
    Subclasses override parseLine() to turn a line into a record and may
    override next() / parse() when they need different behaviour.

    The instance can be used as a context manager so that the underlying
    file handle is closed deterministically.
    """

    def __init__(self,filename):
        self.fh = open(filename)

    def __iter__(self):
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never swallow an exception raised inside the with-block.
        return False

    def close(self):
        """
        Close the underlying file handle.
        """
        self.fh.close()

    def parseLine(self,line):
        """
        Hook for subclasses; the base implementation returns the stripped
        line unchanged.
        """
        return line

    def next(self):
        """
        Return the record for the next non-blank line.

        Raises StopIteration once the file is exhausted, which is what
        terminates ``for elem in parser`` loops.
        """
        for line in self.fh:
            line = line.strip()
            if line:
                return self.parseLine(line)

        raise StopIteration

    def __next__(self):
        # Python 3 spelling of the iterator protocol. Dispatch through
        # self.next() so that a subclass overriding next() is honoured.
        return self.next()

    def parse(self):
        """
        Consume the rest of the file and return all records as a list.
        """
        return list(self)
27
class FilteredReadParser(ReadParser):
    """
    This class offers a parser for the reads that are created by the first
    filtering step performed to cut and join new reads.
    """

    # Number of whitespace-separated columns parseLine() expects.
    NUM_FIELDS = 15

    def __init__(self,filename):
        ReadParser.__init__(self,filename)

    def parseLine(self,line):
        """
        Parse one whitespace-separated line into a dictionary.

        We assume that a line has the following 15 entries:

        read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut

        The returned dictionary uses the key 'id' for read_nr. The strand
        codes 'D' and 'P' are translated to '+' and '-', the sequence is
        lower-cased, and the three quality strings are decoded into lists
        of integers.

        Raises ValueError if the line does not have exactly 15 entries, if
        the strand is neither 'D' nor 'P', or if a numeric column cannot be
        converted to int.
        """
        fields = line.split()
        if len(fields) != self.NUM_FIELDS:
            raise ValueError('expected %d fields but found %d: %r' %
                             (self.NUM_FIELDS, len(fields), line))

        (read_id, chrom, strand, seq, splitpos, read_size, prb, cal_prb,
         chastity, gene_id, p_start, exon_stop, exon_start, p_stop,
         true_cut) = fields

        strand_map = {'D': '+', 'P': '-'}
        if strand not in strand_map:
            raise ValueError("strand must be 'D' or 'P', got %r" % strand)

        # One integer per character of each quality string.
        # NOTE(review): the offsets (-50, -64, +10) are taken over as-is
        # from the previous implementation - confirm against the file
        # format produced by the filtering step.
        prb = [ord(elem)-50 for elem in prb]
        cal_prb = [ord(elem)-64 for elem in cal_prb]
        chastity = [ord(elem)+10 for elem in chastity]

        line_d = {
            'id': read_id,
            'chr': int(chrom),
            'strand': strand_map[strand],
            'seq': seq.lower(),
            'splitpos': int(splitpos),
            'read_size': int(read_size),
            'prb': prb,
            'cal_prb': cal_prb,
            'chastity': chastity,
            'gene_id': gene_id,
            'p_start': int(p_start),
            'exon_stop': int(exon_stop),
            'exon_start': int(exon_start),
            'p_stop': int(p_stop),
            'true_cut': int(true_cut),
        }

        return line_d

    def next(self):
        """
        Return the dictionary for the next non-blank line of the file.

        Raises StopIteration when the file is exhausted.
        """
        # A plain return (not yield): with yield every call to next() would
        # hand back a new generator object instead of a parsed line.
        for line in self.fh:
            line = line.strip()
            if line:
                return self.parseLine(line)

        raise StopIteration

    # Python 3 spelling of the iterator protocol.
    __next__ = next

    def parse(self):
        """
        Read the rest of the file and return a dictionary that maps each
        read id to its parsed line dictionary. Blank lines are skipped.

        Raises ValueError if the same id occurs on more than one line.
        """
        entries = {}

        for elem in self.fh:
            elem = elem.strip()
            if not elem:
                continue

            line_d = self.parseLine(elem)
            read_id = line_d['id']
            if read_id in entries:
                raise ValueError('duplicate read id: %r' % read_id)
            entries[read_id] = line_d

        return entries
94
class RemappedReadParser(ReadParser):
    """
    This class offers a parser for the reads that are remapped by the vmatch
    utility.

    Each line is expected to hold eight whitespace-separated entries:

    id, chr1, pos1, seq1, chr2, pos2, seq2, quality

    """

    def __init__(self,filename):
        ReadParser.__init__(self,filename)

    def parseLine(self,line):
        """
        Parse one whitespace-separated line into a dictionary.

        We assume that a line has the following entries:

        id,chr1,pos1,seq1,chr2,pos2,seq2,quality

        chr1, chr2, pos1 and pos2 are converted to int, both sequences are
        lower-cased, and quality is stored as the string found in the file.

        Raises ValueError if the line does not have exactly eight entries
        or if a numeric column cannot be converted to int.
        """
        read_id,chr1,pos1,seq1,chr2,pos2,seq2,quality = line.split()

        line_d = {
            'id': read_id,
            'chr1': int(chr1),
            'pos1': int(pos1),
            'seq1': seq1.lower(),
            'chr2': int(chr2),
            'pos2': int(pos2),
            'seq2': seq2.lower(),
            # Store the parsed value, not the literal string 'quality'.
            'quality': quality,
        }

        return line_d

    def next(self):
        """
        Return the dictionary for the next non-blank line of the file.

        Raises StopIteration when the file is exhausted.
        """
        # A plain return (not yield): with yield every call to next() would
        # hand back a new generator object instead of a parsed line.
        for line in self.fh:
            line = line.strip()
            if line:
                return self.parseLine(line)

        raise StopIteration

    # Python 3 spelling of the iterator protocol.
    __next__ = next

    def parse(self):
        """
        Read the rest of the file and return a dictionary that maps each
        read id to the list of line dictionaries carrying that id, in file
        order. Blank lines are skipped.
        """
        entries = {}

        for elem in self.fh:
            elem = elem.strip()
            if not elem:
                continue

            line_d = self.parseLine(elem)
            # Append rather than assign so that a repeated id collects all
            # of its lines instead of keeping only the last one.
            entries.setdefault(line_d['id'], []).append(line_d)

        return entries
151
152
class MapParser(ReadParser):
    """
    This class offers a parser for the reads that are remapped by the vmatch
    utility.

    According to the docu the entries are:

    ID, Chromosome, Position, Orientation (D or P), Mismatches, Alignment length, Offset, Alignment

    """

    def __init__(self,filename):
        ReadParser.__init__(self,filename)

    def parseLine(self,line):
        """
        Parse one whitespace-separated line into a dictionary.

        We assume that a line has the following entries:

        id,chr,pos,strand,mismatches,length,offset,seq

        chr, pos, mismatches, length and offset are converted to int and
        the sequence is lower-cased. A strand of 'D' becomes '+', 'P'
        becomes '-', and any other value is passed through unchanged.

        Raises ValueError if the line does not have exactly eight entries
        or if a numeric column cannot be converted to int.
        """
        read_id,chrom,pos,strand,mismatches,length,offset,seq = line.split()

        strand_map = {'D': '+', 'P': '-'}

        line_d = {
            'id': read_id,
            'chr': int(chrom),
            'pos': int(pos),
            'strand': strand_map.get(strand, strand),
            'mismatches': int(mismatches),
            'length': int(length),
            'offset': int(offset),
            'seq': seq.lower(),
        }

        return line_d

    def next(self):
        """
        Return the dictionary for the next non-blank line of the file.

        Raises StopIteration when the file is exhausted.
        """
        # A plain return (not yield): with yield every call to next() would
        # hand back a new generator object instead of a parsed line.
        for line in self.fh:
            line = line.strip()
            if line:
                return self.parseLine(line)

        raise StopIteration

    # Python 3 spelling of the iterator protocol.
    __next__ = next

    def parse(self):
        """
        Read the rest of the file and return a dictionary that maps each
        read id to the list of line dictionaries carrying that id, in file
        order. Blank lines are skipped.
        """
        entries = {}

        for elem in self.fh:
            elem = elem.strip()
            if not elem:
                continue

            line_d = self.parseLine(elem)
            # Append rather than assign so that a repeated id collects all
            # of its lines instead of keeping only the last one.
            entries.setdefault(line_d['id'], []).append(line_d)

        return entries
217
218
class PipelineReadParser(ReadParser):
    """
    This class offers a parser for the reads that are remapped by the vmatch
    utility.

    Each line is expected to hold eleven whitespace-separated entries:

    ID, Chromosome, Position, Orientation (D or P), Mismatches, Alignment length, Offset, Alignment, prb, cal_prb, chastity

    """

    def __init__(self,filename):
        ReadParser.__init__(self,filename)

    def parseLine(self,line):
        """
        Parse one whitespace-separated line into a dictionary.

        We assume that a line has the following entries:

        id,chr,pos,strand,mismatches,length,offset,seq,prb,cal_prb,chastity

        chr, pos, mismatches, length and offset are converted to int and
        the sequence is lower-cased. A strand of 'D' becomes '+', 'P'
        becomes '-', and any other value is passed through unchanged. The
        prb, cal_prb and chastity columns are stored as the raw strings
        found in the file.

        Raises ValueError if the line does not have exactly eleven entries
        or if a numeric column cannot be converted to int.
        """
        # NOTE: an earlier variant of the format carried two additional
        # trailing columns (orig_seq, is_spliced); they are not parsed here.
        (read_id, chrom, pos, strand, mismatches, length, offset, seq,
         prb, cal_prb, chastity) = line.split()

        strand_map = {'D': '+', 'P': '-'}

        line_d = {
            'id': read_id,
            'chr': int(chrom),
            'pos': int(pos),
            'strand': strand_map.get(strand, strand),
            'mismatches': int(mismatches),
            'length': int(length),
            'offset': int(offset),
            'seq': seq.lower(),
            'prb': prb,
            'cal_prb': cal_prb,
            'chastity': chastity,
        }

        return line_d

    def next(self):
        """
        Return the dictionary for the next non-blank line of the file.

        Raises StopIteration when the file is exhausted.
        """
        # A plain return (not yield): with yield every call to next() would
        # hand back a new generator object instead of a parsed line.
        for line in self.fh:
            line = line.strip()
            if line:
                return self.parseLine(line)

        raise StopIteration

    # Python 3 spelling of the iterator protocol.
    __next__ = next

    def parse(self):
        """
        Read the rest of the file and return a dictionary that maps each
        read id to the list of line dictionaries carrying that id, in file
        order. Blank lines are skipped.
        """
        entries = {}

        for elem in self.fh:
            elem = elem.strip()
            if not elem:
                continue

            line_d = self.parseLine(elem)
            # Append rather than assign so that a repeated id collects all
            # of its lines instead of keeping only the last one.
            entries.setdefault(line_d['id'], []).append(line_d)

        return entries