+ modified filtering of reads
[qpalma.git] / qpalma / parsers.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
class ReadParser:
    """
    A base class for the Solexa reads parsers.

    The parser opens ``filename`` on construction and acts as its own
    iterator.  Subclasses are expected to override ``next()`` so that it
    returns one parsed record per call and raises ``StopIteration`` once the
    underlying file is exhausted.
    """

    def __init__(self, filename):
        """Open ``filename`` for reading and keep the handle in ``self.fh``."""
        self.fh = open(filename)

    def __iter__(self):
        """Return the parser itself; it implements the iterator protocol."""
        return self

    def next(self):
        """
        Return the next parsed record (Python 2 iterator protocol).

        The base class has nothing to parse.  Raising here (instead of
        silently returning None) keeps ``parse()`` and ``for`` loops from
        spinning forever on a parser that forgot to override this method.
        """
        raise NotImplementedError("subclasses of ReadParser must implement next()")

    def __next__(self):
        """Python 3 iterator protocol; delegates to the overridable ``next()``."""
        return self.next()

    def close(self):
        """Close the underlying file handle."""
        self.fh.close()

    def parse(self):
        """Consume the remaining records and return them as a list."""
        return list(self)
25
class FilteredReadParser(ReadParser):
    """
    This class offers a parser for the reads that are created by the first
    filtering step performed to cut and join new reads.
    """

    def __init__(self, filename):
        """Open ``filename``; ``self`` must be passed explicitly to the base."""
        ReadParser.__init__(self, filename)

    def parseLine(self, line):
        """
        Parse one whitespace-separated line into a dictionary.

        We assume that a line has the following entries:

        read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop

        Returns a dict keyed by 'id', 'chr', 'strand', 'seq', 'splitpos',
        'read_size', 'prb', 'cal_prb', 'chastity', 'gene_id', 'p_start',
        'exon_stop', 'exon_start' and 'p_stop'.  The position/size fields are
        converted to int and the three per-character fields to lists of int.

        Raises ValueError if the line does not have exactly 14 fields or a
        numeric field cannot be converted.
        """
        (read_id, chr, strand, seq, splitpos, read_size, prb, cal_prb,
         chastity, gene_id, p_start, exon_stop, exon_start, p_stop) = line.split()

        splitpos = int(splitpos)
        read_size = int(read_size)

        # Per-character fields are decoded from their character codes.  The
        # offsets are the ones used by the original code.
        # NOTE(review): confirm they match the encoding written by the
        # filtering step.
        prb = [ord(elem) - 50 for elem in prb]
        cal_prb = [ord(elem) - 64 for elem in cal_prb]
        chastity = [ord(elem) + 10 for elem in chastity]

        p_start = int(p_start)
        exon_stop = int(exon_stop)
        exon_start = int(exon_start)
        p_stop = int(p_stop)

        line_d = {'id': read_id, 'chr': chr, 'strand': strand, 'seq': seq,
                  'splitpos': splitpos, 'read_size': read_size, 'prb': prb,
                  'cal_prb': cal_prb, 'chastity': chastity, 'gene_id': gene_id,
                  'p_start': p_start, 'exon_stop': exon_stop,
                  'exon_start': exon_start, 'p_stop': p_stop}
        return line_d

    def next(self):
        """
        Return the parsed record for the next non-blank line of the file.

        Raises StopIteration when the file is exhausted.  This has to be a
        plain method that returns a record: a ``yield`` here would turn every
        call into a fresh generator object and iteration would never end.
        """
        for line in self.fh:
            line = line.strip()
            if line:
                return self.parseLine(line)

        raise StopIteration

    # Python 3 spelling of the iterator protocol.
    __next__ = next
66
67
class RemappedReadParser(ReadParser):
    """
    This class offers a parser for the reads that are remapped by the vmatch
    utility.

    According to the docu the entries are:

    ID, Chromosome, Position, Orientation (D or P), Mismatches, Alignment length, Offset, Alignment

    """

    def __init__(self, filename):
        """Open ``filename``; ``self`` must be passed explicitly to the base."""
        ReadParser.__init__(self, filename)

    def parseLine(self, line):
        """
        Parse one whitespace-separated line into a dictionary.

        We assume that a line has the following entries:

        id,chr,pos,strand,mismatches,align_len,offset,alignment

        Returns a dict with those names as keys; 'pos', 'mismatches',
        'align_len' and 'offset' are converted to int, the rest stay strings.

        Raises ValueError if the line does not have exactly 8 fields or a
        numeric field cannot be converted.
        """
        read_id, chr, pos, strand, mismatches, align_len, offset, alignment = line.split()
        pos = int(pos)
        mismatches = int(mismatches)
        align_len = int(align_len)
        offset = int(offset)
        line_d = {'id': read_id, 'chr': chr, 'pos': pos, 'strand': strand,
                  'mismatches': mismatches, 'align_len': align_len,
                  'offset': offset, 'alignment': alignment}
        return line_d

    def next(self):
        """
        Return the parsed record for the next non-blank line of the file.

        Raises StopIteration when the file is exhausted.  This has to be a
        plain method that returns a record: a ``yield`` here would turn every
        call into a fresh generator object and iteration would never end.
        """
        for line in self.fh:
            line = line.strip()
            if line:
                return self.parseLine(line)

        raise StopIteration

    # Python 3 spelling of the iterator protocol.
    __next__ = next
102