"""
- def __init__(self,filtered_reads,map_file):
+ def __init__(self,filtered_reads,map_file,map_2nd_file):
assert os.path.exists(filtered_reads), 'Error: Can not find reads file'
self.filtered_reads = filtered_reads
assert os.path.exists(map_file), 'Error: Can not find map file'
+ assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
self.map_file = map_file
+ self.map_2nd_file = map_2nd_file
self.training_set = []
self.testing_set = []
training_keys = all_keys[0:10000]
# saving new datasets
- cPickle.dump(self.training_set,open('%s.train.pickle'%dataset_file,'w+'),protocol=2)
+ #cPickle.dump(self.training_set,open('%s.train.pickle'%dataset_file,'w+'),protocol=2)
#cPickle.dump(training_keys,open('%s.train_keys.pickle'%dataset_file,'w+'),protocol=2)
- #cPickle.dump(self.testing_set,open('%s.test.pickle'%dataset_file,'w+'),protocol=2)
+ cPickle.dump(self.testing_set,open('%s.test.pickle'%dataset_file,'w+'),protocol=2)
+
+ prediction_keys = [0]*len(self.testing_set.keys())
+ for pos,key in enumerate(self.testing_set.keys()):
+ prediction_keys[pos] = key
+
+ cPickle.dump(prediction_keys,open('%s.test_keys.pickle'%dataset_file,'w+'),protocol=2)
def compile_training_set(self):
# this stores the new dataset
- #SeqInfo = []
- #Exons = []
- #OriginalEsts = []
- #Qualities = []
-
dataset = {}
# Iterate over all remapped reads in order to generate for each read a
currentExons[0,1] = filteredRead['exon_stop']
currentExons[1,0] = filteredRead['exon_start']
currentExons[1,1] = filteredRead['p_stop']
- #Exons.append(currentExons)
# add instance to set
currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
- #SeqInfo.append(currentSeqInfo)
- #OriginalEsts.append(original_est)
currentQualities = (quality,cal_prb,chastity)
- #Qualities.append(currentQualities)
dataset[id] = (currentSeqInfo,currentExons,original_est,currentQualities)
print 'Full dataset has size %d' % len(dataset)
print 'Skipped %d reads' % skipped_ctr
- #self.training_set = [SeqInfo, Exons, OriginalEsts, Qualities]
self.training_set = dataset
- def compile_testing_set(self):
-
+ def parse_map_file(self,dataset,map_file):
strand_map = ['-','+']
-
- #SeqInfo = []
- #OriginalEsts = []
- #Qualities = []
-
- dataset = {}
-
instance_counter = 1
- for line in open(self.map_file):
+ for line in open(map_file):
if instance_counter % 1001 == 0:
print 'processed %d examples' % instance_counter
line = line.strip()
slist = line.split()
id = int(slist[0])
- chromo = int(slist[1])
+ chromo = int(slist[1])
pos = int(slist[2])
strand = slist[3]
strand = strand_map[strand == 'D']
# add instance to set
currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
- #SeqInfo.append(currentSeqInfo)
- #OriginalEsts.append(original_est)
currentQualities = (prb,cal_prb,chastity)
- #Qualities.append(currentQualities)
dataset[id] = (currentSeqInfo,original_est,currentQualities)
instance_counter += 1
+ return dataset
+
+
+ def compile_testing_set(self):
+
+ dataset = {}
+
+ # usually we have two files to parse:
+ # the map file from the second run and a subset of the map file from the
+ # first run
+ dataset = self.parse_map_file(dataset,self.map_file)
+ dataset = self.parse_map_file(dataset,self.map_2nd_file)
+
# store the full set
- #self.testing_set = [SeqInfo, OriginalEsts, Qualities]
self.testing_set = dataset