338fe20421ef8becbdbdf5eb78bbb245fa1e4582
[qpalma.git] / tools / run_specific_scripts / transcriptome_analysis / compile_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import array
5 import cPickle
6 import os
7 import pdb
8 import random
9 import sys
10
11 import qpalma
12 import qpalma.tools
13
14 from qpalma.parsers import *
15 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq
16
17 import QPalmaConfiguration as Conf
18
19 #
20 # This script takes as input the map.vm map_2nd.vm files and generates QPalma
21 # dataset pickle files
22 #
23
24
class DatasetGenerator:
   """
   This class wraps the dataset generation for the matches of the second vmatch
   run and the heuristic-filtered matches of the first run.

   An example in our dataset consists of:

   - information on the original read:
         * id
         * nucleotide sequence
         * quality vectors (at the moment: prb and chastity)

   - information on the target dna sequence:
         * chromosome
         * strand
         * fragment start
         * fragment end positions

   this information is stored in tuples which are stored then in a dictionary
   indexed by the reads unique id.

   CAUTION: The dictionary stores a list of the above defined tuples under each
   'id'. This is because one read can be found several times on the genome.
   """

   def __init__(self, map_file, map_2nd_file):
      """
      Remember the two vmatch map-file paths and initialize an empty dataset.

      map_file     -- path to the heuristic-filtered map file of the first run
      map_2nd_file -- path to the map file of the second run

      Raises AssertionError when either file does not exist.
      """
      assert os.path.exists(map_file), 'Error: Can not find map file'
      print(map_2nd_file)
      assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
      self.map_file = map_file
      self.map_2nd_file = map_2nd_file

      # FIX: was initialized as a list, but saveAs() calls .keys() on it and
      # compile_dataset() replaces it with a dict -- calling saveAs() before
      # compile_dataset() used to crash. Start with the real (empty) shape.
      self.dataset = {}

      self.read_size = Conf.read_size


   def saveAs(self, dataset_file):
      """
      Pickle self.dataset to '<dataset_file>.pickle' and its key list to
      '<dataset_file>.keys.pickle' (protocol 2).

      Raises AssertionError if either target file already exists, so an
      existing dataset is never silently overwritten.
      """
      dataset_fn = '%s.pickle' % dataset_file
      dataset_keys_fn = '%s.keys.pickle' % dataset_file

      print(dataset_fn)
      print(dataset_keys_fn)

      assert not os.path.exists(dataset_fn), 'The data_file already exists!'
      assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'

      # saving new dataset and single keys as well.
      # FIX: use context managers so the file handles are closed deterministically
      # (the original passed open(...) inline and leaked the handles).
      with open(dataset_fn, 'w+') as fh:
         cPickle.dump(self.dataset, fh, protocol=2)
      with open(dataset_keys_fn, 'w+') as fh:
         cPickle.dump(self.dataset.keys(), fh, protocol=2)


   def parse_map_file(self, dataset, map_file):
      """
      Parse one vmatch map file and merge its matches into `dataset`.

      dataset  -- dict mapping read id -> list of
                  ((id, chromo, strand, genomicSeq_start, genomicSeq_stop),
                   read_seq, [prb]) tuples; mutated in place and returned.
      map_file -- whitespace-separated file, one match per line:
                  id, chromosome, position, strand (D/P), read sequence,
                  prb quality string.
      """
      instance_counter = 0

      # Hard-coded A. thaliana genome layout.
      # TODO(review): move these paths into the configuration module.
      g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
      acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
      don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'

      g_fmt = 'chr%d.dna.flat'
      s_fmt = 'contig_%d%s'

      num_chromo = 6

      accessWrapper = DataAccessWrapper(g_dir, acc_dir, don_dir, g_fmt, s_fmt)
      seqInfo = SeqSpliceInfo(accessWrapper, range(1, num_chromo))

      with open(map_file) as map_fh:
         for line in map_fh:
            if instance_counter > 0 and instance_counter % 5000 == 0:
               print('processed %d examples' % instance_counter)

            slist = line.split()

            # 'id' renamed to avoid shadowing the builtin
            read_id = int(slist[0])
            chromo = int(slist[1])

            # for the time being we only consider chromosomes 1-5.
            # FIX: hoisted before the remaining per-line work (the original
            # did an O(n) membership test on range(1,6) after parsing
            # everything), and tied to num_chromo for consistency.
            if not (1 <= chromo < num_chromo):
               continue

            pos = int(slist[2])

            # convert vmatch's D(irect)/P(alindromic) strand flag to +/-
            strand = '+' if slist[3] == 'D' else '-'

            # QPalma uses lowercase characters
            read_seq = unbracket_seq(slist[4].lower())

            # fetch the prb quality vector
            _prb = slist[5]

            if strand == '-':
               # mirror the seed position onto the reverse strand coordinates
               pos = seqInfo.chromo_sizes[chromo] - pos - self.read_size
               read_seq = reverse_complement(read_seq)

            # we use an area of +/- 1500 nucleotides around the seed position;
            # the downstream offset is clipped so the window never starts
            # before position 1 (min() is equivalent to the old if/else)
            ds_offset = min(pos - 1, 1500)
            us_offset = 1500

            genomicSeq_start = pos - ds_offset
            genomicSeq_stop = pos + us_offset

            # NOTE(review): dna_seq is unused since the bracket-seq path was
            # removed; the call is kept in case it raises on invalid
            # coordinates -- confirm before deleting it.
            dna_seq = seqInfo.get_seq_and_scores(chromo, strand, genomicSeq_start, genomicSeq_stop, True)

            # in order to save some space we use a signed char to store the
            # qualities. Each quality element can range as follows: -128 <= elem <= 127
            prb = array.array('b', [ord(c) - 64 for c in _prb])

            currentSeqInfo = (read_id, chromo, strand, genomicSeq_start, genomicSeq_stop)
            currentQualities = [prb]

            # as one single read can have several vmatch matches we store all
            # these matches under the unique id of the read
            dataset.setdefault(read_id, []).append((currentSeqInfo, read_seq, currentQualities))

            instance_counter += 1

      print('Total examples processed: %d' % instance_counter)
      return dataset


   def compile_dataset(self):
      """
      Build self.dataset by parsing both map files: the map file from the
      second run and the heuristic-filtered subset of the first run's map
      file. Matches from both files are merged under each read's unique id.
      """
      dataset = {}

      dataset = self.parse_map_file(dataset, self.map_file)
      dataset = self.parse_map_file(dataset, self.map_2nd_file)

      self.dataset = dataset