+ found bug which caused ~10 reads of overlap "0"
[qpalma.git] / tools / data_tools / parseSolexa.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import os.path
6 import cPickle
7 import csv
8 from Solexa import Read
9
def parse(solexa_fid,pickle_fid):
    """Convert a tab-separated read file into a pickled list of Read objects.

    Every row of ``solexa_fid`` must contain exactly 12 tab-separated
    fields.  The fields are consumed by position:

    * 0, 1, 2 -- passed to ``Read`` unchanged, as strings (the original
      code named them chr, pos and seq).
    * 3       -- must parse as an integer; the value itself is not stored.
    * 4       -- strand code, ``'P'`` or ``'D'``, translated to ``'-'`` or
      ``'+'`` respectively.
    * 5 - 8   -- ignored (originally named mismatch, repeat, length,
      deletion).
    * 9 - 11  -- per-character encoded strings, decoded into lists of
      integers with offsets -50, -64 and +10 respectively.

    The resulting list of ``Read`` objects is written to ``pickle_fid``
    with ``cPickle.dump``.

    Raises:
        ValueError: if a row does not have 12 fields, or field 3 is not
            an integer.
        KeyError: if the strand code is neither ``'P'`` nor ``'D'``.
    """
    reader = csv.reader(solexa_fid, delimiter='\t', quoting=csv.QUOTE_NONE)

    expected_fields = 12
    strand_map = {'P': '-', 'D': '+'}

    all_reads = []
    for row in reader:
        # Explicit check rather than ``assert`` so malformed input is still
        # rejected when Python runs with -O.
        if len(row) != expected_fields:
            raise ValueError(
                'line %d: expected %d tab-separated fields, found %d'
                % (reader.line_num, expected_fields, len(row)))

        chrom, pos, seq = row[0], row[1], row[2]

        # The id is not kept, but a non-integer id still rejects the row.
        int(row[3])

        strand = strand_map[row[4]]

        # NOTE(review): the offsets presumably undo the character encoding
        # of the three per-base score strings -- confirm against the
        # format specification of the input files.
        prob = [ord(ch) - 50 for ch in row[9]]
        calibrated_prob = [ord(ch) - 64 for ch in row[10]]
        chastity = [ord(ch) + 10 for ch in row[11]]

        all_reads.append(
            Read(chrom, pos, seq, strand, prob, calibrated_prob, chastity))

    cPickle.dump(all_reads, pickle_fid)
43
if __name__ == '__main__':
    # Usage: parseSolexa.py <solexa_file> <pickle_file>
    #
    # Explicit checks instead of ``assert`` so that the guards -- in
    # particular the refusal to overwrite an existing output file -- stay
    # active when Python runs with -O.
    if len(sys.argv) < 3:
        sys.exit('usage: %s <solexa_file> <pickle_file>' % sys.argv[0])

    solexaFile = sys.argv[1]
    pickleFile = sys.argv[2]

    if not os.path.exists(solexaFile):
        sys.exit('input file does not exist: %s' % solexaFile)
    if os.path.exists(pickleFile):
        sys.exit('output file already exists: %s' % pickleFile)

    # Context managers make sure both files are closed (and the pickle
    # output flushed) even if parsing fails part-way through.
    with open(solexaFile) as solexa_fid:
        with open(pickleFile,'w+') as pickle_fid:
            parse(solexa_fid,pickle_fid)