git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8599 e1793c9e...
[qpalma.git] / scripts / SOAPParser.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import os.path
6 import pdb
7
8 """
9
10 This explanation was take from the SOAP website ( http://soap.genomics.org.cn/ )
11
12 One line for One hit. The columns are separated by '\t'.
13
14 1)id: id of read;
15
16 2)sequence: full sequence of read. the read will be converted to the complementary sequence if mapped on the
17 reverse chain of reference;
18
19 3)quality: quality of sequence. corresponding to sequence, to be consistent with seq, it will be converted
20 too if mapped on reverse chain;
21
22 4)num. of hits: number of equal best hits. the reads with no hits will be ignored;
23
24 5)a/b: flag only meaningful for pair-end alignment, to distinguish which file the read is belonging to;
25
26 6)length: length of the read, if aligned after trimming, it will report the information of trimmed read;
27
28 7)+/-: alignment on the direct(+) or reverse(-) chain of the reference;
29
30 8)chr: id of reference sequence;
31
32 9)location: location of first bp on the reference, counted from 1;
33
34 10)types: type of hits.
35
36 "0": exact match
37 "1~100 RefAllele->OffsetQueryAlleleQual": number of mismatches, followed by detailed mutation sites and
38 switch of allele types. Offset is relative to the initial location on reference. 'OffsetAlleleQual': offset,
39 allele, and quality.
40
41 Example: "2 A->10T30 C->13A32" means there are two mismatches, one on location+10 of
42 reference, and the other on location+13 of reference. The allele on reference is A and C respectively, while
43 query allele type and its quality is T,30 and A,32.
44
45 "100+n Offset": n-bp insertion on read. Example: "101 15" means 1-bp insertion on read, start after
46 location+15 on reference.
47
48 "200+n Offset": n-bp deletion on read. Example: "202 16" means 2-bp deletion on query, start after 16bp on reference
49 """
50
51 class SOAPParser:
52
53 def __init__(self,filename):
54 assert os.path.exists(filename)
55 self.fh = open(filename)
56
57 def parseLine(self,line):
58 """
59 We assume that a line has entries as described above.
60 """
61
62 line = line.strip()
63 id,seq,q,num_hits,flag,length,align_strand,chrom,loc,type = line.split('\t')[:10]
64 sites = line.split('\t')[10:]
65 id = int(id)
66
67 seq=seq.lower()
68
69 line_d = {'id':id, 'seq':seq, 'num_hits':num_hits, 'flag':flag,\
70 'length':length, 'align_strand':align_strand, 'chrom':chrom,\
71 'location':loc, 'type':type, 'sites':sites }
72
73 return line_d
74
75
76 def parse(self):
77 entries = {}
78
79 for elem in self.fh:
80 line_d = self.parseLine(elem)
81 id = line_d['id']
82 try:
83 entries[id] = [line_d]
84 except:
85 old_entry = entries[id]
86 old_entry.append(line_d)
87 entries[id] = old_entry
88
89 return entries
90
91
92 if __name__ == '__main__':
93 soap_p = SOAPParser('/fml/ag-raetsch/home/fabio/projects/soap_1.07/fabio_out_s8_g3_w20.soap')
94 soap_p.parse()