b98de772eeef8b95447ce54c4b8f5e05dc66d528
[qpalma.git] / qpalma / DatasetUtils.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 import array
10 import cPickle
11 import os
12 import pdb
13
14 from sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
15
16 illumina_ga_range = (-5,40)
17 #roche_454_range =
18
19
20 def generateTrainingDataset(settings):
21 """
22 This function creates a training dataset.
23 """
24 dataset = []
25
26 saveData('training',dataset,settings)
27
28
29 def generatePredictionDataset(settings):
30 """
31 This function is responsible for the prediction dataset generation for the matches of
32 the second vmatch run and the heuristic-filtered matches of the first run.
33
34 An example in our dataset consists of:
35
36 - information on the original read:
37 * id
38 * nucleotide sequence
39 * quality vectors (at the moment: prb and chastity)
40
41 - information on the target dna sequence:
42 * chromosome
43 * strand
44 * fragment start
45 * fragment end positions
46
47 this information is stored in tuples which are stored then in a dictionary
48 indexed by the reads unique id.
49
50 CAUTION: The dictionary stores a list of the above defined tuples under each
51 'id'. This is because one read can be found several times on the genome.
52 """
53
54 #map_file = settings['map_file']
55 map_file = jp(result_dir = self.global_settings['approximation_dir'],'map.vm')
56 assert os.path.exists(map_file), 'Error: Can not find map file'
57
58 dataset = {}
59
60 prb_offset = 64
61
62 # This tuple specifies an interval for valid Illumina Genome Analyzer quality values
63 if settings['platform'] == 'IGA':
64 quality_interval = illumina_ga_range
65
66 # this means that around each vmatch hit we cut out a window of 1500 bases
67 # each up-/downstream.
68 half_window_size = settings['half_window_size']
69
70 instance_counter = 0
71
72 accessWrapper = DataAccessWrapper(settings)
73 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
74
75 for line in open(map_file):
76 if instance_counter > 0 and instance_counter % 5000 == 0:
77 print 'processed %d examples' % instance_counter
78
79 slist = line.split()
80
81 id = int(slist[0])
82 chromo = int(slist[1])
83 pos = int(slist[2])
84
85 # for the time being we only consider chromosomes 1-5
86 if not chromo in settings['allowed_fragments']:
87 continue
88
89 # convert D/P strand info to +/-
90 strand = ['-','+'][slist[3] == 'D']
91
92 # QPalma uses lowercase characters
93 bracket_seq = slist[4].lower()
94 read_seq = unbracket_seq(bracket_seq)
95
96 # in prediction we do not have any bracketed reads anymore
97 assert not '[' in read_seq and not ']' in read_seq
98
99 # we use an area of +/- `self.half_window_size` nucleotides around the seed position
100 if pos > self.half_window_size+1:
101 us_offset = self.half_window_size
102 else:
103 us_offset = pos - 1
104
105 if pos+self.half_window_size < seqInfo.chromo_sizes[chromo]:
106 ds_offset = self.half_window_size
107 else:
108 ds_offset = seqInfo.chromo_sizes[chromo]-pos-1
109
110 genomicSeq_start = pos - us_offset
111 genomicSeq_stop = pos + ds_offset
112
113 dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
114
115 # In order to save some space we use a signed char to store the
116 # qualities. Each quality element can range as follows: -128 <= elem <= 127
117
118 q_values = map(lambda x: ord(x)-self.prb_offset,slist[5])
119
120 if settings['perform_checks']:
121 for entry in q_values:
122 assert quality_interval[0] <= entry <= quality_interval[1], 'Error: Your read has invalid quality values: %d' % entry
123
124 prb = array.array('b',q_values)
125
126 currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
127 currentQualities = [prb]
128
129 # As one single read can have several vmatch matches we store all these
130 # matches under the unique id of the read
131 dataset.setdefault(id, []).append((currentSeqInfo,read_seq,currentQualities))
132
133 instance_counter += 1
134
135 print 'Total examples processed: %d' % instance_counter
136
137 saveData('prediction',dataset,settings)
138
139
140 def saveData(prefix,dataset,settings):
141 """
142 """
143 ddir = settings['dataset_dir']
144 dataset_fn = jp(ddir,'%s_data.pickle'%prefix)
145 dataset_keys_fn = jp(ddir,'%s_data.keys.pickle'%prefix)
146
147 assert not os.path.exists(dataset_fn), 'The data_file already exists!'
148 assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
149
150 # saving new dataset and single keys as well
151 cPickle.dump(dataset,open(dataset_fn,'w+'),protocol=2)
152 cPickle.dump(dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)