+ changed dataset compiler to take the two different map files into account
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 14 May 2008 08:13:55 +0000 (08:13 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 14 May 2008 08:13:55 +0000 (08:13 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9002 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/compile_dataset.py

index 30cb0a4..88d386d 100644
@@ -28,12 +28,14 @@ class DatasetGenerator:
 
    """
    
-   def __init__(self,filtered_reads,map_file):
+   def __init__(self,filtered_reads,map_file,map_2nd_file):
       assert os.path.exists(filtered_reads), 'Error: Can not find reads file'
       self.filtered_reads = filtered_reads
 
       assert os.path.exists(map_file), 'Error: Can not find map file'
+      assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
       self.map_file = map_file
+      self.map_2nd_file = map_2nd_file
 
       self.training_set = []
       self.testing_set  = []
@@ -51,18 +53,20 @@ class DatasetGenerator:
       training_keys = all_keys[0:10000]
 
       # saving new datasets
-      cPickle.dump(self.training_set,open('%s.train.pickle'%dataset_file,'w+'),protocol=2)
+      #cPickle.dump(self.training_set,open('%s.train.pickle'%dataset_file,'w+'),protocol=2)
       #cPickle.dump(training_keys,open('%s.train_keys.pickle'%dataset_file,'w+'),protocol=2)
-      #cPickle.dump(self.testing_set,open('%s.test.pickle'%dataset_file,'w+'),protocol=2)
+      cPickle.dump(self.testing_set,open('%s.test.pickle'%dataset_file,'w+'),protocol=2)
+
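+      # remember the keys of the testing set; they identify the reads handed to the prediction step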
+      prediction_keys = [0]*len(self.testing_set.keys())
+      for pos,key in enumerate(self.testing_set.keys()):
+         prediction_keys[pos] = key
+
+      cPickle.dump(prediction_keys,open('%s.test_keys.pickle'%dataset_file,'w+'),protocol=2)
 
 
    def compile_training_set(self):
       # this stores the new dataset
-      #SeqInfo        = []
-      #Exons          = []
-      #OriginalEsts   = []
-      #Qualities      = []
-
       dataset = {}
 
       # Iterate over all remapped reads in order to generate for each read a
@@ -104,14 +107,10 @@ class DatasetGenerator:
          currentExons[0,1] = filteredRead['exon_stop']
          currentExons[1,0] = filteredRead['exon_start']
          currentExons[1,1] = filteredRead['p_stop']
-         #Exons.append(currentExons)
 
          # add instance to set
          currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
-         #SeqInfo.append(currentSeqInfo)
-         #OriginalEsts.append(original_est)
          currentQualities = (quality,cal_prb,chastity)
-         #Qualities.append(currentQualities)
 
          dataset[id] = (currentSeqInfo,currentExons,original_est,currentQualities)
 
@@ -120,30 +119,21 @@ class DatasetGenerator:
       print 'Full dataset has size %d' % len(dataset)
       print 'Skipped %d reads' % skipped_ctr
 
-      #self.training_set = [SeqInfo, Exons, OriginalEsts, Qualities]
       self.training_set = dataset
       
 
-   def compile_testing_set(self):
-
+   def parse_map_file(self,dataset,map_file):
       strand_map = ['-','+']
-
-      #SeqInfo = []
-      #OriginalEsts = []
-      #Qualities = []
-
-      dataset = {}
-
       instance_counter = 1
 
-      for line in open(self.map_file):
+      for line in open(map_file):
          if instance_counter % 1001 == 0:
             print 'processed %d examples' % instance_counter
 
          line  = line.strip()
          slist = line.split()
          id    = int(slist[0])
-         chromo      = int(slist[1])
+         chromo   = int(slist[1])
          pos      = int(slist[2])
          strand   = slist[3]
          strand   = strand_map[strand == 'D']
@@ -163,17 +153,26 @@ class DatasetGenerator:
 
          # add instance to set
          currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
-         #SeqInfo.append(currentSeqInfo)
-         #OriginalEsts.append(original_est)
          currentQualities = (prb,cal_prb,chastity)
-         #Qualities.append(currentQualities)
 
          dataset[id] = (currentSeqInfo,original_est,currentQualities)
 
          instance_counter += 1
 
+      return dataset
+
+
+   def compile_testing_set(self):
+
+      dataset = {}
+
+      # usually we have two files to parse:
+      # the map file from the second run and a subset of the map file from the
+      # first run
+      dataset = self.parse_map_file(dataset,self.map_file)
+      dataset = self.parse_map_file(dataset,self.map_2nd_file)
+
       # store the full set
-      #self.testing_set = [SeqInfo, OriginalEsts, Qualities]
       self.testing_set = dataset
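
With this change the constructor signature becomes DatasetGenerator(filtered_reads, map_file, map_2nd_file), so existing callers must pass the second map file as well. A minimal usage sketch of the new call pattern (Python 2, matching the file's style; the import path and input file names are hypothetical, only DatasetGenerator, compile_testing_set and testing_set come from the diff above):

    from compile_dataset import DatasetGenerator

    # hypothetical input paths; all three files must exist,
    # otherwise the new assertions in __init__ fire
    filtered_reads = 'reads.filtered'
    map_file       = 'run1.map'   # subset of the map file from the first run
    map_2nd_file   = 'run2.map'   # map file from the second run

    dg = DatasetGenerator(filtered_reads, map_file, map_2nd_file)

    # merges the reads from both map files into one testing set
    # via the new parse_map_file helper
    dg.compile_testing_set()
    print 'testing set has %d reads' % len(dg.testing_set)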