+ fixed alignment index calculation for negative strand
[qpalma.git] / qpalma / DatasetUtils.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 import array
10 import cPickle
11 import os
12 import os.path
13 import pdb
14
15 from sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
16
# shorthand for building filesystem paths
jp = os.path.join

# Valid quality-value interval (inclusive min, max) for reads produced by an
# Illumina Genome Analyzer; used by addToDataset's optional per-read checks.
illumina_ga_range = (-5,40)
#roche_454_range =
22
def generateTrainingDataset(settings):
    """
    This function creates a training dataset and saves it to disk via
    saveData().

    NOTE(review): no map files are added here yet, so the saved dataset is
    empty -- presumably a stub to be filled in; confirm against callers.
    """
    # Bug fix: this used to be `dataset = []`, but saveData() calls
    # dataset.keys() and addToDataset() uses dataset.setdefault(), both of
    # which require a dict (see generatePredictionDataset, which uses {}).
    dataset = {}

    saveData('training',dataset,settings)
30
31
def addToDataset(map_file,dataset,settings):
    """
    Parse a vmatch map file and add one entry per alignment hit to `dataset`.

    Each line of the map file is expected to hold (whitespace separated):
    read id, chromosome/fragment id, seed position, strand flag ('D'/'P'),
    the (possibly bracketed) read sequence and the quality string.

    For every hit we store a tuple

        ((id, chromo, strand, genomicSeq_start, genomicSeq_stop),
         read_seq, [prb])

    under the read's unique id in `dataset` (a dict).  One read can be hit
    several times by vmatch, hence the list per id.

    Returns the (mutated) `dataset` dict.
    """
    assert os.path.exists(map_file), 'Error: Can not find map file'

    prb_offset = settings['prb_offset']

    # Interval of valid quality values for the sequencing platform.  Only the
    # Illumina Genome Analyzer range is known here; for any other platform we
    # keep None and fail with a clear message if quality checks are requested
    # (previously this raised an opaque NameError).
    if settings['platform'] == 'IGA':
        quality_interval = illumina_ga_range
    else:
        quality_interval = None

    # around each vmatch hit we cut out a window of `half_window_size` bases
    # each up-/downstream
    half_window_size = settings['half_window_size']

    instance_counter = 0

    accessWrapper = DataAccessWrapper(settings)
    seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])

    for line in open(map_file):
        line = line.strip()
        # skip comments and empty lines
        if line.startswith('#') or line == '':
            continue

        if instance_counter > 0 and instance_counter % 5000 == 0:
            print('processed %d examples' % instance_counter)

        slist = line.split()

        # renamed from `id` to avoid shadowing the builtin
        read_id = int(slist[0])
        chromo = int(slist[1])
        pos = int(slist[2])

        # only consider hits on the configured fragments/chromosomes
        if not chromo in settings['allowed_fragments']:
            continue

        # convert D/P strand info to +/- ('D' maps to '+')
        strand = ['-','+'][slist[3] == 'D']

        # QPalma uses lowercase characters
        bracket_seq = slist[4].lower()
        read_seq = unbracket_seq(bracket_seq)

        # in prediction we do not have any bracketed reads anymore
        assert not '[' in read_seq and not ']' in read_seq

        # clamp the +/- half_window_size area around the seed position to the
        # fragment boundaries
        if pos > half_window_size+1:
            us_offset = half_window_size
        else:
            us_offset = pos - 1

        if pos+half_window_size < seqInfo.getFragmentSize(chromo):
            ds_offset = half_window_size
        else:
            ds_offset = seqInfo.getFragmentSize(chromo)-pos-1

        genomicSeq_start = pos - us_offset
        genomicSeq_stop = pos + ds_offset

        # NOTE(review): the returned sequence is unused here; the call is kept
        # in case get_seq_and_scores has (caching) side effects -- confirm.
        dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)

        # In order to save some space we use a signed char to store the
        # qualities. Each quality element can range as follows: -128 <= elem <= 127
        q_values = [ord(x)-prb_offset for x in slist[5]]

        if settings['perform_checks']:
            assert quality_interval is not None, \
                'Error: no quality interval known for platform %s' % settings['platform']
            for entry in q_values:
                assert quality_interval[0] <= entry <= quality_interval[1], 'Error: Your read has invalid quality values: %d' % entry

        prb = array.array('b',q_values)

        currentSeqInfo = (read_id,chromo,strand,genomicSeq_start,genomicSeq_stop)
        currentQualities = [prb]

        # As one single read can have several vmatch matches we store all these
        # matches under the unique id of the read
        dataset.setdefault(read_id, []).append((currentSeqInfo,read_seq,currentQualities))

        instance_counter += 1

    print('Total examples processed: %d' % instance_counter)

    return dataset
117
118
def generatePredictionDataset(settings):
    """
    Build the prediction dataset from the matches of the second vmatch run
    and the heuristic-filtered matches of the first run.

    An example in our dataset consists of:

    - information on the original read:
        * id
        * nucleotide sequence
        * quality vectors (at the moment: prb and chastity)

    - information on the target dna sequence:
        * chromosome
        * strand
        * fragment start
        * fragment end positions

    this information is stored in tuples which are stored then in a dictionary
    indexed by the reads unique id.

    CAUTION: The dictionary stores a list of the above defined tuples under each
    'id'. This is because one read can be found several times on the genome.
    """
    dataset = {}

    # merge both map files into the same dataset: first the spliced reads of
    # the initial run, then the spliced matches of the approximation step
    map_files = (
        settings['spliced_reads_fn'],
        jp(settings['approximation_dir'],'map.vm.spliced'),
    )
    for map_file in map_files:
        dataset = addToDataset(map_file,dataset,settings)

    saveData('prediction',dataset,settings)
153
154
def saveData(prefix,dataset,settings):
    """
    Pickle `dataset` and, separately, its list of keys to the file names
    configured in `settings` for the given `prefix`.

    prefix  -- either 'prediction' or 'training'; selects which pair of
               output paths to read from `settings`
    dataset -- dict mapping read ids to lists of example tuples

    Raises AssertionError for any other prefix.  Existing files are
    overwritten without warning.
    """
    if prefix == 'prediction':
        dataset_fn = settings['prediction_dataset_fn']
        dataset_keys_fn = settings['prediction_dataset_keys_fn']
    elif prefix == 'training':
        dataset_fn = settings['training_dataset_fn']
        dataset_keys_fn = settings['training_dataset_keys_fn']
    else:
        assert False, 'Error: unknown dataset prefix %r' % (prefix,)

    # Bug fix: pickle protocol 2 produces binary data, so the files must be
    # opened in binary mode ('wb', not the former 'w+' text mode which can
    # corrupt the stream on some platforms).  Also make sure the handles are
    # closed instead of leaking them.
    fh = open(dataset_fn,'wb')
    try:
        cPickle.dump(dataset,fh,protocol=2)
    finally:
        fh.close()

    fh = open(dataset_keys_fn,'wb')
    try:
        cPickle.dump(dataset.keys(),fh,protocol=2)
    finally:
        fh.close()