+ fixed quality conversion: according to the new specs, the quality of prb is `ascii value...`
author    fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
          Thu, 31 Jul 2008 21:15:48 +0000 (21:15 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
          Thu, 31 Jul 2008 21:15:48 +0000 (21:15 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@10276 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/PipelineHeuristic.py
tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py
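The change is a one-number fix to the ASCII offset used when decoding prb quality strings: the old code subtracted 50, the new specs call for 64, consistent with the Solexa/Illumina-style encoding of that era, which stored a score q as chr(q + 64). A minimal sketch of the decoding, separate from the commit itself (decode_prb and the example string are illustrative, not from the repository):

    # Each character of a prb string is an ASCII-encoded quality score.
    # The old code assumed an offset of 50; the new specs use 64, i.e. a
    # score q is stored as chr(q + 64).
    OLD_OFFSET = 50
    NEW_OFFSET = 64

    def decode_prb(prb, offset=NEW_OFFSET):
        """Decode an ASCII-encoded quality string into integer scores."""
        return [ord(c) - offset for c in prb]

    # 'h' is ASCII 104: it now decodes to quality 40 instead of 54.
    assert decode_prb('h') == [40]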

diff --git a/scripts/PipelineHeuristic.py b/scripts/PipelineHeuristic.py
index 93d6265..d4b92a8 100644
@@ -268,7 +268,7 @@ class PipelineHeuristic:
          exons[1,0]     = effective_len
          est            = unb_seq
          original_est   = seq
-         quality        = map(lambda x:ord(x)-50,prb)
+         quality        = map(lambda x:ord(x)-64,prb)
 
          #pdb.set_trace()
 
@@ -398,7 +398,7 @@ class PipelineHeuristic:
       strand   = location['strand']
       original_est = location['seq']
       quality      = location['prb']
-      quality        = map(lambda x:ord(x)-50,quality)
+      quality        = map(lambda x:ord(x)-64,quality)
       #cal_prb  = location['cal_prb']
       
       original_est = original_est.lower()
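Both hunks in PipelineHeuristic.py apply the same decoding, and both rely on Python 2's map() returning a list so that quality can be indexed later. A list comprehension is an equivalent form that also yields a real list under Python 3 (a sketch only, with an illustrative input string, not part of the commit):

    prb = 'hhhh'                          # illustrative quality string
    quality = [ord(x) - 64 for x in prb]  # -> [40, 40, 40, 40]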
diff --git a/tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py b/tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py
index 415405d..338fe20 100644
@@ -74,7 +74,7 @@ class DatasetGenerator:
       cPickle.dump(self.dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)
 
 
-   def parse_map_file(self,dataset,map_file,first_map):
+   def parse_map_file(self,dataset,map_file):
       strand_map = ['-','+']
       instance_counter = 0
 
@@ -141,30 +141,26 @@ class DatasetGenerator:
 
          # Take into account that the results of the second run do not contain
          # the original reads anymore.
-         if first_map:
-            original_read = read_seq
-         else:
-            original_read = create_bracket_seq(dna_seq[ds_offset:ds_offset+self.read_size],read_seq)
+         #if first_map:
+         #   original_read = read_seq
+         #else:
+         #   original_read = create_bracket_seq(dna_seq[ds_offset:ds_offset+self.read_size],read_seq)
 
          # in order to save some space we use a signed char to store the
          # qualities. Each quality element can range as follows: -128 <= elem <= 127
-         prb      = array.array('b',[0]*self.read_size)
-         #cal_prb  = array.array('b',[0]*self.read_size)
-         #chastity = array.array('b',[0]*self.read_size)
+         prb = array.array('b',map(lambda x: ord(x)-64,_prb))
 
-         # convert the ascii-encoded quality scores
-         for idx in range(self.read_size):
-            prb[idx]       = ord(_prb[idx])-50
-            #cal_prb[idx]   = ord(_cal_prb[idx])-64
-            #chastity[idx]  = ord(_chastity[idx])+10
+         #pdb.set_trace()
 
          # add instance to set
          currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
-         currentQualities = (prb)
+         currentQualities = [prb]
 
          # as one single read can have several vmatch matches we store all
          # these matches under the unique id of the read
-         dataset.setdefault(id, []).append((currentSeqInfo,original_read,currentQualities))
+
+         #dataset.setdefault(id, []).append((currentSeqInfo,original_read,currentQualities))
+         dataset.setdefault(id, []).append((currentSeqInfo,read_seq,currentQualities))
 
          instance_counter += 1
 
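The rewritten conversion packs the decoded scores straight into the signed-char array, replacing the zero-filled array plus explicit loop. With an offset of 64, every printable ASCII character (33-126) maps into -31..62, comfortably inside the signed-char range -128..127 noted in the comment above. A standalone sketch of the pattern (_prb is a made-up input):

    import array

    _prb = 'hhhhff'   # hypothetical ASCII-encoded quality string
    prb = array.array('b', [ord(c) - 64 for c in _prb])
    print(prb.tolist())   # [40, 40, 40, 40, 38, 38]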
@@ -178,7 +174,7 @@ class DatasetGenerator:
       # usually we have two files to parse:
       # the map file from the second run and a subset of the map file from the
       # first run
-      dataset = self.parse_map_file(dataset,self.map_file,True)
-      dataset = self.parse_map_file(dataset,self.map_2nd_file,False)
+      dataset = self.parse_map_file(dataset,self.map_file)
+      dataset = self.parse_map_file(dataset,self.map_2nd_file)
 
       self.dataset = dataset
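With the first_map flag gone, both map files flow through the identical code path, and multiple vmatch matches for one read still accumulate under that read's id via setdefault(). A toy end-to-end sketch of that flow (all names and data are hypothetical, not taken from the repository):

    def parse_map_file(dataset, entries):
        # entries: (read_id, read_seq, prb) tuples, one per vmatch match
        for read_id, read_seq, prb in entries:
            quality = [ord(c) - 64 for c in prb]
            dataset.setdefault(read_id, []).append((read_seq, quality))
        return dataset

    first_run  = [(1, 'acgt', 'hhhh')]
    second_run = [(1, 'acgt', 'ffff')]   # same read matched again
    dataset = {}
    for entries in (first_run, second_run):
        dataset = parse_map_file(dataset, entries)
    print(len(dataset[1]))   # 2: both matches stored under read id 1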