#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import array
import random
import os
import re
import pdb
import cPickle

import qpalma
import qpalma.tools

from qpalma.parsers import *
from qpalma.sequence_utils import *

import qpalma.Configuration as Conf
#
# This script takes the map.vm and map_2nd.vm files as input and generates
# the QPalma dataset pickle files.
#

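# The parsing code below assumes the following whitespace-separated vmatch
# column layout (inferred from the field indices used in parse_map_file):
# field 0 = read id, 1 = chromosome, 2 = position, 3 = strand (D/P),
# 7 = read sequence of the first run resp. 11 of the second run,
# 8 = prb, 9 = cal_prb and 10 = chastity quality strings.
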
class DatasetGenerator:

    def __init__(self,map_file,map_2nd_file):
        assert os.path.exists(map_file), 'Error: Can not find map file'
        assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
        self.map_file = map_file
        self.map_2nd_file = map_2nd_file

        # the dataset maps a read id to the list of all matches of that read,
        # so it has to be a dict (saveAs also relies on self.dataset.keys())
        self.dataset = {}

        self.read_size = 38

    def saveAs(self,dataset_file):
        dataset_fn = '%s.pickle' % dataset_file
        dataset_keys_fn = '%s.keys.pickle' % dataset_file

        assert not os.path.exists(dataset_fn), 'The data file already exists!'
        assert not os.path.exists(dataset_keys_fn), 'The data keys file already exists!'

        # save the new dataset and its keys separately; pickle protocol 2 is
        # a binary format, so the files are opened in binary mode
        cPickle.dump(self.dataset,open(dataset_fn,'wb'),protocol=2)
        cPickle.dump(self.dataset.keys(),open(dataset_keys_fn,'wb'),protocol=2)

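    # A loading sketch (not used by this script itself); 'prefix' stands for
    # whatever name was passed to saveAs above:
    #
    #   dataset = cPickle.load(open('%s.pickle' % prefix,'rb'))
    #   keys    = cPickle.load(open('%s.keys.pickle' % prefix,'rb'))
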
    def parse_map_file(self,dataset,map_file,first_map):
        """
        Parse one vmatch map file and add an entry for every match to the
        given dataset dict. first_map tells whether this is the map file of
        the first or of the second run.
        """

        # vmatch encodes the strand as D (direct) / P (palindromic)
        strand_map = ['-','+']
        instance_counter = 0

        for line in open(map_file):
            if instance_counter > 0 and instance_counter % 50000 == 0:
                print 'processed %d examples' % instance_counter

            slist = line.split()

            id     = int(slist[0])
            chromo = int(slist[1])
            pos    = int(slist[2])

            # convert D/P strand info to +/- ; the boolean (strand == 'D')
            # indexes strand_map as 0 or 1
            strand = slist[3]
            strand = strand_map[strand == 'D']

            if first_map:
                read_seq = slist[7]
            else:
                read_seq = slist[11]

            # QPalma uses lowercase characters
            read_seq = read_seq.lower()

            # fetch the three quality vectors
            _prb      = slist[8]
            _cal_prb  = slist[9]
            _chastity = slist[10]

            # for the time being we only consider chromosomes 1-5
            if chromo not in range(1,6):
                continue

            # we use an area of +/- 1500 nucleotides around the seed position;
            # the left offset is clamped so we never run past the start of
            # the chromosome
            if pos > 1501:
                ds_offset = 1500
            else:
                ds_offset = pos-1

            us_offset = 1500

            genomicSeq_start = pos - ds_offset
            genomicSeq_stop  = pos + us_offset
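            # e.g. pos = 1000 yields ds_offset = 999 and the window [1,2500],
            # whereas pos = 5000 yields the full window [3500,6500]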

            # fetch the missing information from the original reads
            flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'

            try:
                dna_seq = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,flat_files,True)
            except:
                # debugging aid: drop into the debugger to inspect the
                # offending coordinates
                pdb.set_trace()

            # take into account that the results of the second run do not
            # contain the original reads anymore
            if first_map:
                original_est = read_seq
            else:
                original_est = create_bracket_seq(dna_seq[ds_offset:ds_offset+self.read_size],read_seq)

            # in order to save some space we use a signed char to store the
            # qualities; each element can range from -128 to 127
            prb      = array.array('b',[0]*self.read_size)
            #cal_prb  = array.array('b',[0]*self.read_size)
            chastity = array.array('b',[0]*self.read_size)

            # convert the ascii-encoded quality scores
            for idx in range(self.read_size):
                prb[idx]      = ord(_prb[idx])-50
                #cal_prb[idx]  = ord(_cal_prb[idx])-64
                chastity[idx] = ord(_chastity[idx])+10
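            # (the original characters can be recovered by inverting the
            # offsets again, e.g. chr(prb[idx]+50))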

            # add the instance to the set
            currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
            currentQualities = (prb,chastity)

            # as one single read can have several vmatch matches we store all
            # these matches under the unique id of the read
            dataset.setdefault(id, []).append((currentSeqInfo,original_est,currentQualities))

            instance_counter += 1

        print 'Total examples processed: %d' % instance_counter
        return dataset

    def compile_dataset(self):
        dataset = {}

        # usually we have two files to parse: the map file from the second
        # run and a subset of the map file from the first run
        dataset = self.parse_map_file(dataset,self.map_file,True)
        dataset = self.parse_map_file(dataset,self.map_2nd_file,False)

        self.dataset = dataset
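
# A minimal driver sketch; the map file names and the dataset prefix below
# are placeholders, not paths from the original pipeline.
if __name__ == '__main__':
    generator = DatasetGenerator('map.vm','map_2nd.vm')
    generator.compile_dataset()
    generator.saveAs('dataset_run_1')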