+ some optimization
[qpalma.git] / tools / run_specific_scripts / transcriptome_analysis / compile_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import array
5 import cPickle
6 import os
7 import pdb
8 import random
9 import sys
10
11 import qpalma
12 import qpalma.tools
13
14 from qpalma.parsers import *
15 from qpalma.sequence_utils import *
16
17 import qpalma.Configuration as Conf
18
19 #
20 # This script takes as input the map.vm map_2nd.vm files and generates QPalma
21 # dataset pickle files
22 #
23
24
25 class DatasetGenerator:
26 """
27 This class wraps the dataset generation for the matches of the second vmatch
28 run and the heuristic-filtered matches of the first run.
29
30 An example in our dataset consists of:
31
32 - information on the original read:
33 * id
34 * nucleotide sequence
35 * quality vectors (at the moment: prb and chastity)
36
37 - information on the target dna sequence:
38 * chromosome
39 * strand
40 * fragment start
41 * fragment end positions
42
43 this information is stored in tuples which are stored then in a dictionary
44 indexed by the reads unique id.
45
46 CAUTION: The dictionary stores a list of the above defined tuples under each
47 'id'. This is because one read can be found several times on the genome.
48 """
49
50 def __init__(self,map_file,map_2nd_file):
51 assert os.path.exists(map_file), 'Error: Can not find map file'
52 assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
53 self.map_file = map_file
54 self.map_2nd_file = map_2nd_file
55
56 self.dataset = []
57
58 #self.read_size = 38
59 self.read_size = 36
60
61
62 def saveAs(self,dataset_file):
63 dataset_fn = '%s.pickle'% dataset_file
64 dataset_keys_fn = '%s.keys.pickle'%dataset_file
65
66 print dataset_fn
67 print dataset_keys_fn
68
69 assert not os.path.exists(dataset_fn), 'The data_file already exists!'
70 assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
71
72 # saving new dataset and single keys as well
73 cPickle.dump(self.dataset,open(dataset_fn,'w+'),protocol=2)
74 cPickle.dump(self.dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)
75
76
77 def parse_map_file(self,dataset,map_file,first_map):
78 strand_map = ['-','+']
79 instance_counter = 0
80
81 for line in open(map_file):
82 if instance_counter > 0 and instance_counter % 50000 == 0:
83 print 'processed %d examples' % instance_counter
84
85 #if instance_counter == 10:
86 # break
87
88 slist = line.split()
89
90 id = int(slist[0])
91 chromo = int(slist[1])
92 pos = int(slist[2])
93
94 # convert D/P strand info to +/-
95 strand = slist[3]
96 strand = strand_map[strand == 'D']
97
98 if first_map:
99 read_seq = slist[7]
100 else:
101 read_seq = slist[11]
102
103 # QPalma uses lowercase characters
104 read_seq = read_seq.lower()
105
106 # fetch the three quality vectors
107 _prb = slist[8]
108 _cal_prb = slist[9]
109 _chastity = slist[10]
110
111 # for the time being we only consider chromosomes 1-5
112 if not chromo in range(1,6):
113 continue
114
115 # we use an area of +/- 1500 nucleotides around the seed position
116 if pos > 1501:
117 ds_offset = 1500
118 else:
119 ds_offset = pos-1
120
121 us_offset = 1500
122
123 genomicSeq_start = pos - ds_offset
124 genomicSeq_stop = pos + us_offset
125
126 # fetch missing information from original reads
127 flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
128
129 try:
130 dna_seq = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,flat_files,True)
131 except:
132 pdb.set_trace()
133
134 # Take into account that the results of the second run do not contain
135 # the original reads anymore.
136 if first_map:
137 original_read = read_seq
138 else:
139 original_read = create_bracket_seq(dna_seq[ds_offset:ds_offset+self.read_size],read_seq)
140
141 # in order to save some space we use a signed char to store the
142 # qualities. Each quality element can range as follows: -128 <= elem <= 127
143 prb = array.array('b',[0]*self.read_size)
144 #cal_prb = array.array('b',[0]*self.read_size)
145 chastity = array.array('b',[0]*self.read_size)
146
147 # convert the ascii-encoded quality scores
148 for idx in range(self.read_size):
149 prb[idx] = ord(_prb[idx])-50
150 #cal_prb[idx] = ord(_cal_prb[idx])-64
151 chastity[idx] = ord(_chastity[idx])+10
152
153 # add instance to set
154 currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
155 currentQualities = (prb,chastity)
156
157 # as one single read can have several vmatch matches we store all
158 # these matches under the unique id of the read
159 dataset.setdefault(id, []).append((currentSeqInfo,original_read,currentQualities))
160
161 instance_counter += 1
162
163 print 'Total examples processed: %d' % instance_counter
164 return dataset
165
166
167 def compile_dataset(self):
168 dataset = {}
169
170 # usually we have two files to parse:
171 # the map file from the second run and a subset of the map file from the
172 # first run
173 #dataset = self.parse_map_file(dataset,self.map_file,True)
174 dataset = self.parse_map_file(dataset,self.map_2nd_file,False)
175
176 self.dataset = dataset