d7d157673a001716b1962d6a778d4b26eca4ab8c
[qpalma.git] / qpalma / parsers.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import pdb
5 import os
6 import sys
7 import mmap
8 import resource
9
10 class ReadParser:
11 """
12 A base class for the Solexa reads parsers.
13 """
14
15 def __init__(self,filename):
16 self.fh = open(filename)
17
18 def __iter__(self):
19 return self
20
21 def next(self):
22 pass
23
24 def parse(self):
25 lines = []
26
27 for elem in self:
28 lines.append(elem)
29
30 return lines
31
32
33 def parse(filename):
34 entries = []
35
36 fh = open(filename)
37
38 all_lines = fh.read()
39 all_lines = all_lines.split('\n')
40
41 for parsed_line in all_lines:
42 if parsed_line == '':
43 continue
44
45 id,chr,pos,strand,mismatches,length,offset,seq,prb,cal_prb,chastity =\
46 parsed_line.split()
47 chr = int(chr)
48 pos = int(pos)
49
50 mismatches = int(mismatches)
51 length = int(length)
52 offset = int(offset)
53
54 line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand,\
55 'mismatches':mismatches, 'length':length, 'offset':offset,\
56 'seq':seq,'prb':prb,'cal_prb':cal_prb,'chastity':chastity}
57
58 entries.append((line_d,parsed_line))
59
60 return entries
61
62
63 def parse(self):
64 entries = {}
65
66 for elem in self.fh:
67 line_d = self.parseLine(elem)
68 id = line_d['id']
69 assert id not in entries, pdb.set_trace()
70 entries[id] = line_d
71
72 return entries
73
74 class RemappedReadParser(ReadParser):
75 """
76 This class offers a parser for the reads that are remapped by the vmatch
77 utility.
78
79 According to the docu the entries are:
80
81 ID, Chromosome, Position, Orientation (D or P), Mismatches, Alignment length, Offset, Alignment
82
83 """
84
85 def __init__(self,filename):
86 ReadParser.__init__(self,filename)
87
88 def parseLine(self,line):
89 """
90 We assume that a line has the following entires:
91
92 """
93 id,chr1,pos1,seq1,chr2,pos2,seq2,quality = line.split()
94
95 chr1 = int(chr1)
96 chr2 = int(chr2)
97
98 seq1=seq1.lower()
99 seq2=seq2.lower()
100
101 pos1 = int(pos1)
102 pos2 = int(pos2)
103
104 line_d = {'id':id, 'chr1':chr1, 'pos1':pos1, 'seq1':seq1, 'chr2':chr2,\
105 'pos2':pos2, 'seq2':seq2, 'quality':'quality'}
106
107 return line_d
108
109 def next(self):
110 for line in self.fh:
111 line = line.strip()
112 yield self.parseLine(line)
113
114 raise StopIteration
115
116 def parse(self):
117 entries = {}
118
119 for elem in self.fh:
120 line_d = self.parseLine(elem)
121 id = line_d['id']
122 try:
123 entries[id] = [line_d]
124 except:
125 old_entry = entries[id]
126 old_entry.append(line_d)
127 entries[id] = old_entry
128
129 return entries
130
131
132 class MapParser(ReadParser):
133 """
134 This class offers a parser for the reads that are remapped by the vmatch
135 utility.
136
137 According to the docu the entries are:
138
139 ID, Chromosome, Position, Orientation (D or P), Mismatches, Alignment length, Offset, Alignment
140
141 """
142
143 def __init__(self,filename):
144 ReadParser.__init__(self,filename)
145
146 def parseLine(self,line):
147 """
148 We assume that a line has the following entries:
149
150 """
151
152 id,chr,pos,strand,mismatches,length,offset,seq = line.split()
153
154 chr = int(chr)
155 pos = int(pos)
156
157 if strand == 'D':
158 strand = '+'
159
160 if strand == 'P':
161 strand = '-'
162
163 mismatches = int(mismatches)
164 length = int(length)
165 offset = int(offset)
166
167 seq=seq.lower()
168
169 line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand,\
170 'mismatches':mismatches, 'length':length, 'offset':offset,\
171 'seq':seq}
172
173 return line_d
174
175 def next(self):
176 for line in self.fh:
177 line = line.strip()
178 yield self.parseLine(line)
179
180 raise StopIteration
181
182 def parse(self):
183 entries = {}
184
185 for elem in self.fh:
186 line_d = self.parseLine(elem)
187 id = line_d['id']
188 try:
189 entries[id] = [line_d]
190 except:
191 old_entry = entries[id]
192 old_entry.append(line_d)
193 entries[id] = old_entry
194
195 return entries
196
197
198
199 def parse_map_vm_heuristic(filename):
200 """
201 This function parse the map.vm file for the PipelineHeuristics
202 """
203 entries = []
204
205 fh = open(filename)
206
207 all_lines = fh.read()
208 all_lines = all_lines.split('\n')
209
210 for parsed_line in all_lines:
211 if parsed_line == '':
212 continue
213
214 id,chr,pos,strand,mismatches,length,offset,seq,prb,cal_prb,chastity =\
215 parsed_line.split()
216 chr = int(chr)
217 pos = int(pos)
218
219 mismatches = int(mismatches)
220 length = int(length)
221 offset = int(offset)
222
223 line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand,\
224 'mismatches':mismatches, 'length':length, 'offset':offset,\
225 'seq':seq,'prb':prb,'cal_prb':cal_prb,'chastity':chastity}
226
227 entries.append((line_d,parsed_line))
228
229 return entries
230
231 def cpu():
232 return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
233 resource.getrusage(resource.RUSAGE_SELF).ru_stime)
234
235 class Line:
236 pass
237
238 def parse_filtered_reads(filename):
239 """
240
241 """
242 entries = {}
243
244 #fh = open(filename)
245 #all_lines = fh.read()
246 #all_lines = all_lines.split('\n')
247
248 #for parsed_line in all_lines:
249 # if parsed_line == '':
250 # continue
251
252 print 'asking for map'
253 data = map_file(filename)
254 print 'obtained map'
255
256 strand_map = ['-','+']
257
258 full_line_split_time = 0
259 full_array_time = 0
260 full_dict_time = 0
261
262 parsing_start = cpu()
263
264 print 'start parsing'
265
266 while True:
267
268 start = cpu()
269 parsed_line = data.readline().strip()
270
271 if parsed_line == '':
272 break
273
274 id,chr,strand,seq,splitpos,read_size,_prb,_cal_prb,_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = parsed_line.split()
275 stop = cpu()
276
277 full_line_split_time += stop-start
278
279 id = int(id)
280 splitpos = int(splitpos)
281 read_size = int(read_size)
282
283 seq=seq.lower()
284
285 #assert strand in ['D','P']
286
287 strand = strand_map[strand == 'D']
288
289 chr = int(chr)
290
291 start_array_time = cpu()
292
293 prb = [0]*read_size
294 cal_prb = [0]*read_size
295 chastity = [0]*read_size
296
297 for idx in range(read_size):
298 prb[idx] = ord(_prb[idx])-50
299 cal_prb[idx] = ord(_cal_prb[idx])-64
300 chastity[idx] = ord(_chastity[idx])+10
301
302 #import array
303
304 #prb = array.array('b',_prb)
305 #cal_prb = array.array('b',_cal_prb)
306 #chastity = array.array('b',_chastity)
307
308 #for idx in range(read_size):
309 # prb[idx] -= 50
310 # cal_prb[idx] -= 64
311 # chastity[idx] += 10
312
313 #assert prb == __prb
314 #assert cal_prb == __cal_prb
315 #assert chastity == __chastity
316
317 stop_array_time = cpu()
318
319 full_array_time += stop_array_time - start_array_time
320
321
322 start_dict_time = cpu()
323
324 p_start = int(p_start)
325 exon_stop = int(exon_stop)
326 exon_start = int(exon_start)
327 p_stop = int(p_stop)
328 true_cut = int(true_cut)
329
330 line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
331 'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
332 'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
333 'p_stop':p_stop,'true_cut':true_cut}
334
335 stop_dict_time = cpu()
336
337 full_dict_time += stop_dict_time - start_dict_time
338
339 entries[id] = line_d
340
341 parsing_stop = cpu()
342
343 print 'parsing took %f secs' % (stop-start)
344 print 'line split time %f ' % full_line_split_time
345 print 'array time %f ' % full_array_time
346 print 'dict time %f ' % full_dict_time
347
348 return entries
349
350
351 def parse_map_vm(filename):
352 """
353 Parses the map.vm files from Vmatch
354
355
356 For an original read we can have several vmatch locations
357 """
358 entries = {}
359
360 #fh = open(filename)
361 #all_lines = fh.read()
362 #all_lines = all_lines.split('\n')
363
364 #for parsed_line in all_lines:
365 # if parsed_line == '':
366 # continue
367
368 data = map_file(filename)
369
370 while True:
371
372 parsed_line = data.readline()
373
374 if parsed_line == '':
375 break
376
377 id,chr,pos,strand,mismatches,length,offset,seq,q1,q2,q3 = parsed_line.split()
378
379 id = int(id)
380
381 chr = int(chr)
382 pos = int(pos)
383
384 if strand == 'D':
385 strand = '+'
386
387 if strand == 'P':
388 strand = '-'
389
390 mismatches = int(mismatches)
391 length = int(length)
392 offset = int(offset)
393
394 seq=seq.lower()
395
396 line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand,\
397 'mismatches':mismatches, 'length':length, 'offset':offset}
398
399 if entries.has_key(id):
400 #pdb.set_trace()
401 old_entry = entries[id]
402 old_entry.append(line_d)
403 entries[id] = old_entry
404 else:
405 entries[id] = [line_d]
406
407 return entries
408
409
410 def map_file(filename):
411 """
412 As the mmapped file object behaves just like plain file objects
413 you do not have to unmap but just close the file.
414 """
415
416 file = open(filename, "r+")
417 size = os.path.getsize(filename)
418
419 data = mmap.mmap(file.fileno(), size, mmap.ACCESS_READ)
420 assert size == len(data), 'Could not map whole file at once!'
421
422 return data