7cc028c864a1b045c529134006109595f5638a50
[qpalma.git] / tools / run_specific_scripts / transcriptome_analysis / compile_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import array
5 import cPickle
6 import os
7 import pdb
8 import random
9 import sys
10
11 import qpalma
12 import qpalma.tools
13
14 from qpalma.parsers import *
15 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
16
17 import QPalmaConfiguration as Conf
18
#
# This script takes as input the map.vm and map_2nd.vm files and generates
# QPalma dataset pickle files.
#
23
24
25 class DatasetGenerator:
26 """
27 This class wraps the dataset generation for the matches of the second vmatch
28 run and the heuristic-filtered matches of the first run.
29
30 An example in our dataset consists of:
31
32 - information on the original read:
33 * id
34 * nucleotide sequence
35 * quality vectors (at the moment: prb and chastity)
36
37 - information on the target dna sequence:
38 * chromosome
39 * strand
40 * fragment start
41 * fragment end positions
42
43 this information is stored in tuples which are stored then in a dictionary
44 indexed by the reads unique id.
45
46 CAUTION: The dictionary stores a list of the above defined tuples under each
47 'id'. This is because one read can be found several times on the genome.
48 """
49
50 def __init__(self,map_file,map_2nd_file):
51 assert os.path.exists(map_file), 'Error: Can not find map file'
52 print map_2nd_file
53 assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
54 self.map_file = map_file
55 self.map_2nd_file = map_2nd_file
56
57 self.dataset = []
58
59 self.read_size = 38
60 #self.prb_offset = 50
61 self.prb_offset = 64
62
63 self.half_window_size = 1500
64
65
66 def saveAs(self,dataset_file):
67 dataset_fn = '%s.pickle'% dataset_file
68 dataset_keys_fn = '%s.keys.pickle'%dataset_file
69
70 print dataset_fn, dataset_keys_fn
71
72 assert not os.path.exists(dataset_fn), 'The data_file already exists!'
73 assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
74
75 #pdb.set_trace()
76 # saving new dataset and single keys as well
77 cPickle.dump(self.dataset,open(dataset_fn,'w+'),protocol=2)
78 cPickle.dump(self.dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)
79
80
81 def parse_map_file(self,dataset,map_file,first_round):
82 instance_counter = 0
83
84 g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
85 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
86 don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
87
88 g_fmt = 'chr%d.dna.flat'
89 s_fmt = 'contig_%d%s'
90
91 num_chromo = 6
92
93 accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
94 seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
95
96 not_consistent_ctr = 0
97
98 for line in open(map_file):
99 if instance_counter > 0 and instance_counter % 5000 == 0:
100 print 'processed %d examples' % instance_counter
101
102 if instance_counter == 10000:
103 print 'Not consistent ctr: %d' % not_consistent_ctr
104 # break
105
106 slist = line.split()
107
108 id = int(slist[0])
109 chromo = int(slist[1])
110 pos = int(slist[2])
111
112 # for the time being we only consider chromosomes 1-5
113 if not chromo in range(1,6):
114 continue
115
116 # convert D/P strand info to +/-
117 if slist[3] == 'D':
118 strand = '+'
119 else:
120 strand = '-'
121
122 # QPalma uses lowercase characters
123 bracket_seq = slist[4].lower()
124 read_seq = unbracket_seq(bracket_seq)
125 reconstructed_dna_seq = reconstruct_dna_seq(bracket_seq)
126
127 if strand == '-' and first_round:
128 read_seq = reverse_complement(read_seq)
129 #reconstructed_dna_seq = reverse_complement(reconstructed_dna_seq)
130
131 assert not '[' in read_seq and not ']' in read_seq
132
133 # we use an area of +/- `self.half_window_size` nucleotides around the seed position
134 if pos > self.half_window_size+1:
135 us_offset = self.half_window_size
136 else:
137 us_offset = pos - 1
138
139 if pos+self.half_window_size < seqInfo.chromo_sizes[chromo]:
140 ds_offset = self.half_window_size
141 else:
142 ds_offset = seqInfo.chromo_sizes[chromo]-pos-1
143
144 genomicSeq_start = pos - us_offset
145 genomicSeq_stop = pos + ds_offset
146
147 dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
148
149 #if strand == '-':
150 # dna_part = dna_seq[us_offset-len(read_seq):us_offset]
151 #else:
152 # dna_part = dna_seq[us_offset:us_offset+len(read_seq)]
153
154 #if reconstructed_dna_seq != dna_part:
155 # not_consistent_ctr += 1
156 # pdb.set_trace()
157
158 ## check whether everything is ok
159 # try:
160 # currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
161 # except:
162 # problem_ctr += 1
163 # continue
164
165 # In order to save some space we use a signed char to store the
166 # qualities. Each quality element can range as follows: -128 <= elem <= 127
167
168 prb = array.array('b',map(lambda x: ord(x)-self.prb_offset,slist[5]))
169
170 # add instance to set
171 currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
172 currentQualities = [prb]
173
174 # As one single read can have several vmatch matches we store all
175 # these matches under the unique id of the read
176 dataset.setdefault(id, []).append((currentSeqInfo,read_seq,currentQualities))
177
178 instance_counter += 1
179
180 print 'Total examples processed: %d' % instance_counter
181 print 'Not consistent ctr: %d' % not_consistent_ctr
182 return dataset
183
184
185 def compile_dataset(self):
186 dataset = {}
187
188 # usually we have two files to parse:
189 # the map file from the second run and a subset of the map file from the
190 # first run
191
192 dataset = self.parse_map_file(dataset,self.map_file,True)
193 dataset = self.parse_map_file(dataset,self.map_2nd_file,False)
194
195 self.dataset = dataset