# changed dataset compilation to support new framework
# [qpalma.git] / qpalma / DatasetUtils.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 import array
10 import cPickle
11 import os
12 import pdb
13
14 #from parsers import *
15 from sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
16
17
18 def generateDataset(global_settings):
19 """
20 This function is responsible for the dataset generation for the matches of
21 the second vmatch run and the heuristic-filtered matches of the first run.
22
23 An example in our dataset consists of:
24
25 - information on the original read:
26 * id
27 * nucleotide sequence
28 * quality vectors (at the moment: prb and chastity)
29
30 - information on the target dna sequence:
31 * chromosome
32 * strand
33 * fragment start
34 * fragment end positions
35
36 this information is stored in tuples which are stored then in a dictionary
37 indexed by the reads unique id.
38
39 CAUTION: The dictionary stores a list of the above defined tuples under each
40 'id'. This is because one read can be found several times on the genome.
41 """
42
43 assert os.path.exists(map_file), 'Error: Can not find map file'
44 print map_2nd_file
45 assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
46 self.map_file = map_file
47 self.map_2nd_file = map_2nd_file
48
49 dataset = {}
50
51 prb_offset = 64
52
53 # this means that around each vmatch hit we cut out a window of 1500 bases
54 # each up-/downstream.
55 half_window_size = 1500
56
57 instance_counter = 0
58
59 accessWrapper = DataAccessWrapper(settings)
60 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
61
62 for line in open(map_file):
63 if instance_counter > 0 and instance_counter % 5000 == 0:
64 print 'processed %d examples' % instance_counter
65
66 slist = line.split()
67
68 id = int(slist[0])
69 chromo = int(slist[1])
70 pos = int(slist[2])
71
72 # for the time being we only consider chromosomes 1-5
73 if not chromo in range(1,6):
74 continue
75
76 # convert D/P strand info to +/-
77 strand = ['-','+'][slist[3] == 'D']
78
79 # QPalma uses lowercase characters
80 bracket_seq = slist[4].lower()
81 read_seq = unbracket_seq(bracket_seq)
82 reconstructed_dna_seq = reconstruct_dna_seq(bracket_seq)
83
84 if strand == '-' and first_round:
85 read_seq = reverse_complement(read_seq)
86 #reconstructed_dna_seq = reverse_complement(reconstructed_dna_seq)
87
88 assert not '[' in read_seq and not ']' in read_seq
89
90 # we use an area of +/- `self.half_window_size` nucleotides around the seed position
91 if pos > self.half_window_size+1:
92 us_offset = self.half_window_size
93 else:
94 us_offset = pos - 1
95
96 if pos+self.half_window_size < seqInfo.chromo_sizes[chromo]:
97 ds_offset = self.half_window_size
98 else:
99 ds_offset = seqInfo.chromo_sizes[chromo]-pos-1
100
101 genomicSeq_start = pos - us_offset
102 genomicSeq_stop = pos + ds_offset
103
104 dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
105
106 #if strand == '-':
107 # dna_part = dna_seq[us_offset-len(read_seq):us_offset]
108 #else:
109 # dna_part = dna_seq[us_offset:us_offset+len(read_seq)]
110
111 ## check whether everything is ok
112 # try:
113 # currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
114 # except:
115 # problem_ctr += 1
116 # continue
117
118 # In order to save some space we use a signed char to store the
119 # qualities. Each quality element can range as follows: -128 <= elem <= 127
120
121 prb = array.array('b',map(lambda x: ord(x)-self.prb_offset,slist[5]))
122
123 currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
124 currentQualities = [prb]
125
126 # As one single read can have several vmatch matches we store all these
127 # matches under the unique id of the read
128 dataset.setdefault(id, []).append((currentSeqInfo,read_seq,currentQualities))
129
130 instance_counter += 1
131
132 print 'Total examples processed: %d' % instance_counter
133
134 ddir = global_settings['dataset_dir']
135 dataset_fn = jp(ddir,'data.pickle')
136 dataset_keys_fn = jp(ddir,'data.keys.pickle')
137
138 assert not os.path.exists(dataset_fn), 'The data_file already exists!'
139 assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
140
141 # saving new dataset and single keys as well
142 cPickle.dump(dataset,open(dataset_fn,'w+'),protocol=2)
143 cPickle.dump(dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)