+ changed dataset representation from parallel lists to a dict keyed by read id
author    fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 12 May 2008 20:58:54 +0000 (20:58 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 12 May 2008 20:58:54 +0000 (20:58 +0000)
+ fixed datatype bug in the numpy exons array

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8980 e1793c9e-67f9-0310-80fc-b846ff1f7b36
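
For context: the commit replaces the three parallel lists (SeqInfo, OriginalEsts, Qualities) with a single dict keyed by read id. A minimal sketch of the two layouts (the id and all field values below are hypothetical):

    # before: three parallel lists, aligned by position
    SeqInfo, OriginalEsts, Qualities = [], [], []

    # after: one dict keyed by read id; training entries additionally
    # carry the 2x2 exons array built further down in this diff
    dataset = {}
    dataset[42] = (
        (42, 1, '+', 500, 3500),          # currentSeqInfo: id,chromo,strand,start,stop
        None,                             # currentExons (a 2x2 int array in the real code)
        'acgtacgt',                       # original_est
        ([30, 28], [29, 27], [40, 41]),   # currentQualities: quality,cal_prb,chastity
    )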

scripts/compile_dataset.py

index 1d24c0e..30cb0a4 100644
@@ -38,37 +38,46 @@ class DatasetGenerator:
       self.training_set = []
       self.testing_set  = []
 
+      print 'parsing filtered reads..'
+      self.all_filtered_reads = parse_filtered_reads(self.filtered_reads)
+      print 'found %d filtered reads' % len(self.all_filtered_reads)
+
 
-   def saveAs(dataset_file):
+   def saveAs(self,dataset_file):
       assert not os.path.exists(dataset_file), 'The dataset_file already exists!'
 
+      all_keys = self.training_set.keys()
+      random.shuffle(all_keys)
+      # subsample of 10000 keys for a smaller training set (its dump is
+      # currently commented out below)
+      training_keys = all_keys[0:10000]
+
       # saving new datasets
       cPickle.dump(self.training_set,open('%s.train.pickle'%dataset_file,'w+'),protocol=2)
-      cPickle.dump(self.testing_set,open('%s.test.pickle'%dataset_file,'w+'),protocol=2)
+      #cPickle.dump(training_keys,open('%s.train_keys.pickle'%dataset_file,'w+'),protocol=2)
+      #cPickle.dump(self.testing_set,open('%s.test.pickle'%dataset_file,'w+'),protocol=2)
 
 
    def compile_training_set(self):
-      joinp = os.path.join
-
-      print 'parsing filtered reads..'
-      all_filtered_reads = parse_filtered_reads(self.filtered_reads)
-      print 'found %d filtered reads' % len(all_filtered_reads)
-
       # this stores the new dataset
-      SeqInfo        = []
-      Exons          = []
-      OriginalEsts   = []
-      Qualities      = []
+      #SeqInfo        = []
+      #Exons          = []
+      #OriginalEsts   = []
+      #Qualities      = []
+
+      dataset = {}
 
       # Iterate over all remapped reads in order to generate for each read a
       # training / prediction example
       instance_counter = 1
       skipped_ctr = 0
 
-      for id,filteredRead in all_filtered_reads.items():
+      for id,filteredRead in self.all_filtered_reads.items():
          if instance_counter % 1001 == 0:
             print 'processed %d examples' % instance_counter
 
+         # the training set consists only of spliced reads, whose ids
+         # lie below 1000000300000
+         if id >= 1000000300000:
+            continue
+
          if filteredRead['strand'] != '+':
             skipped_ctr += 1
             continue
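
The loop bodies in this diff index filteredRead by field name. The record layout implied by those accesses is roughly the following sketch (all values are made up; the real parse_filtered_reads() may carry more fields):

    example_read = {
        'strand'    : '+',         # training keeps only '+' strand reads
        'p_start'   : 1000,        # genomic start of the first exon
        'exon_stop' : 1035,        # genomic end of the first exon
        'exon_start': 1200,        # genomic start of the second exon
        'p_stop'    : 1236,        # genomic end of the second exon
        'seq'       : 'acgtacgt',  # original read sequence
        'prb'       : [30, 28],    # per-base quality
        'cal_prb'   : [29, 27],    # calibrated quality
        'chastity'  : [40, 41],    # chastity values
    }
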
@@ -90,26 +99,40 @@ class DatasetGenerator:
          cal_prb        = filteredRead['cal_prb']
          chastity       = filteredRead['chastity']
 
+         currentExons = zeros((2,2),dtype=numpy.int)
+         currentExons[0,0] = filteredRead['p_start']
+         currentExons[0,1] = filteredRead['exon_stop']
+         currentExons[1,0] = filteredRead['exon_start']
+         currentExons[1,1] = filteredRead['p_stop']
+         #Exons.append(currentExons)
+
          # add instance to set
-         SeqInfo.append((id,chromo,strand,genomicSeq_start,genomicSeq_stop))
-         OriginalEsts.append(original_est)
-         Qualities.append( (quality,cal_prb,chastity) )
+         currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
+         #SeqInfo.append(currentSeqInfo)
+         #OriginalEsts.append(original_est)
+         currentQualities = (quality,cal_prb,chastity)
+         #Qualities.append(currentQualities)
+
+         dataset[id] = (currentSeqInfo,currentExons,original_est,currentQualities)
 
          instance_counter += 1
 
-      print 'Full dataset has size %d' % len(SeqInfo)
+      print 'Full dataset has size %d' % len(dataset)
       print 'Skipped %d reads' % skipped_ctr
 
-      self.training_set = [SeqInfo, OriginalEsts, Qualities]
+      #self.training_set = [SeqInfo, Exons, OriginalEsts, Qualities]
+      self.training_set = dataset
       
 
    def compile_testing_set(self):
 
       strand_map = ['-','+']
 
-      SeqInfo = []
-      OriginalEsts = []
-      Qualities = []
+      #SeqInfo = []
+      #OriginalEsts = []
+      #Qualities = []
+
+      dataset = {}
 
       instance_counter = 1
 
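The datatype fix named in the commit message is the explicit dtype in the zeros((2,2)) call above. A minimal illustration of the assumed failure mode (numpy's default float dtype silently storing exon coordinates as floats):

    import numpy
    from numpy import zeros

    exons_float = zeros((2, 2))                   # default dtype is float64
    exons_int   = zeros((2, 2), dtype=numpy.int)  # the fix: explicit integer dtype

    exons_float[0, 0] = 1000
    exons_int[0, 0]   = 1000

    print exons_float.dtype, exons_float[0, 0]    # float64 1000.0
    print exons_int.dtype, exons_int[0, 0]        # e.g. int64 1000
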
@@ -128,22 +151,30 @@ class DatasetGenerator:
          genomicSeq_start = pos - 1500
          genomicSeq_stop  = pos + 1500
 
-         original_est = slist[11]
+         # fetch missing information from original reads
+         filteredRead = self.all_filtered_reads[id]
+         original_est = filteredRead['seq']
          original_est = original_est.lower()
 
-         prb      = [ord(elem)-50 for elem in slist[8]]
-         cal_prb  = [ord(elem)-64 for elem in slist[9]]
-         chastity = [ord(elem)+10 for elem in slist[10]]
+         prb     = filteredRead['prb']
+         cal_prb = filteredRead['cal_prb']
+         chastity = filteredRead['chastity']
 
          # add instance to set
-         SeqInfo.append((id,chromo,strand,genomicSeq_start,genomicSeq_stop))
-         OriginalEsts.append(original_est)
-         Qualities.append( (prb,cal_prb,chastity) )
+         currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
+         #SeqInfo.append(currentSeqInfo)
+         #OriginalEsts.append(original_est)
+         currentQualities = (prb,cal_prb,chastity)
+         #Qualities.append(currentQualities)
+
+         dataset[id] = (currentSeqInfo,original_est,currentQualities)
 
          instance_counter += 1
 
       # store the full set
-      self.testing_set = [SeqInfo, OriginalEsts, Qualities]
+      #self.testing_set = [SeqInfo, OriginalEsts, Qualities]
+      self.testing_set = dataset
 
 
 def compile_dataset_direct(filtered_reads,dataset_file):
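
For completeness, a usage sketch for the new representation, assuming a dataset written by saveAs() above (the 'mydata' file prefix is hypothetical). Note that training entries are 4-tuples, while testing entries omit the exons array and are 3-tuples:

    import cPickle

    train = cPickle.load(open('mydata.train.pickle', 'rb'))

    for id, (seqInfo, exons, original_est, qualities) in train.items():
        read_id, chromo, strand, g_start, g_stop = seqInfo
        quality, cal_prb, chastity = qualities
        # exons[0,:] is the first exon (start,stop), exons[1,:] the second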