600e420c007fa86b17aa96bc5bf5cb29dfece0de
[qpalma.git] / tools / run_specific_scripts / transcriptome_analysis / compile_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import random
6 import os
7 import re
8 import pdb
9 import cPickle
10
11 #import numpy
12 #from numpy.matlib import mat,zeros,ones,inf
13
14 import qpalma
15 import qpalma.tools
16
17 from qpalma.parsers import *
18 from qpalma.sequence_utils import *
19
20 #from Genefinding import *
21 #from genome_utils import load_genomic
22
23 import qpalma.Configuration as Conf
24
def create_bracket_seq(dna_seq,read_seq):
    """
    Combine a dna sequence and a read sequence of equal length into
    "bracket" notation: positions where the two sequences agree contribute
    the shared character, mismatches contribute '[<dna_char><read_char>]'.

    Example:
        dna : aaa
        read: aac
        result: aa[ac]
    """
    assert len(dna_seq) == len(read_seq)
    pieces = []
    for dna_char, read_char in zip(dna_seq, read_seq):
        if dna_char == read_char:
            pieces.append(dna_char)
        else:
            pieces.append('[%s%s]' % (dna_char, read_char))
    return "".join(pieces)
36
37
38
39 class DatasetGenerator:
40 """
41
42 """
43
44 def __init__(self,map_file,map_2nd_file):
45 assert os.path.exists(map_file), 'Error: Can not find map file'
46 assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
47 self.map_file = map_file
48 self.map_2nd_file = map_2nd_file
49
50 self.dataset = []
51
52
53 def saveAs(self,dataset_file):
54 dataset_fn = '%s.pickle'% dataset_file
55 dataset_keys_fn = '%s.keys.pickle'%dataset_file
56
57 assert not os.path.exists(dataset_fn), 'The data_file already exists!'
58 assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
59
60 # saving new dataset and single keys as well
61 cPickle.dump(self.dataset,open(dataset_fn,'w+'),protocol=2)
62 cPickle.dump(self.dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)
63
64
65 def parse_map_file(self,dataset,map_file,first_map):
66 strand_map = ['-','+']
67 instance_counter = 1
68
69 for line in open(map_file):
70 if instance_counter % 1001 == 0:
71 print 'processed %d examples' % instance_counter
72
73 line = line.strip()
74 slist = line.split()
75 id = int(slist[0])
76 chromo = int(slist[1])
77 pos = int(slist[2])
78 strand = slist[3]
79 strand = strand_map[strand == 'D']
80
81 if first_map:
82 read_seq = slist[7]
83 else:
84 read_seq = slist[11]
85
86 _prb = slist[8]
87 _cal_prb = slist[9]
88 _chastity = slist[10]
89
90 if strand != '+':
91 continue
92
93 if not chromo in range(1,6):
94 continue
95
96 genomicSeq_start = pos - 1500
97 genomicSeq_stop = pos + 1500
98
99 # fetch missing information from original reads
100 flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
101 original_info = get_seq_and_scores(chromo,strand,pos,pos+35,flat_files)
102 dna_seq = original_info[0]
103
104 #filteredRead = self.all_filtered_reads[id]
105 #original_est = filteredRead['seq']
106 original_est = create_bracket_seq(dna_seq,read_seq)
107
108 for idx in range(read_size):
109 prb[idx] = ord(_prb[idx])-50
110 cal_prb[idx] = ord(_cal_prb[idx])-64
111 chastity[idx] = ord(_chastity[idx])+10
112
113 # add instance to set
114 currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
115 currentQualities = (prb,cal_prb,chastity)
116
117 # as one single read can have several vmatch matches we store all
118 # these matches under the unique id of the read
119 if dataset.has_key(id):
120 old_entry = dataset[id]
121 old_entry.append((currentSeqInfo,original_est,currentQualities))
122 dataset[id] = old_entry
123 else:
124 dataset[id] = [(currentSeqInfo,original_est,currentQualities)]
125
126 instance_counter += 1
127
128 print 'Total examples processed: %d' % instance_counter
129
130 return dataset
131
132
    def compile_dataset(self):
        """
        Parse both map files into one dict (read id -> list of matches)
        and store the result in self.testing_set.
        """
        dataset = {}

        # usually we have two files to parse:
        # the map file from the second run and a subset of the map file from the
        # first run
        dataset = self.parse_map_file(dataset,self.map_file,True)
        dataset = self.parse_map_file(dataset,self.map_2nd_file,False)

        # store the full set
        # NOTE(review): saveAs() serializes self.dataset, which is never
        # updated here -- confirm whether testing_set or dataset is intended
        self.testing_set = dataset