+ modified Lookup table to work with negative strands
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 10 Jul 2008 09:24:52 +0000 (09:24 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 10 Jul 2008 09:24:52 +0000 (09:24 +0000)
+ added correct negative strand processing to the pipeline

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9968 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Lookup.py
scripts/PipelineHeuristic.py

index fd68fe9..f1bc39a 100644 (file)
@@ -9,7 +9,7 @@ import pdb
 import random
 
 from numpy.matlib import inf
-from qpalma.sequence_utils import get_seq_and_scores,get_flatfile_size
+from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size
 
 
 class Lookup:
@@ -24,6 +24,10 @@ class Lookup:
       self.chromosomes = range(1,6)
       self.strands = ['+','-']
 
+      flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      chromo_list = [1,2,3,4,5,8,9,10,11,12]
+      self.seqInfo = SeqSpliceInfo(flat_files,chromo_list)
+
       self.genomicSequencesP   = [0]*(len(self.chromosomes)+1)
       self.acceptorScoresP     = [0]*(len(self.chromosomes)+1)
       self.donorScoresP        = [0]*(len(self.chromosomes)+1)
@@ -33,7 +37,8 @@ class Lookup:
       self.donorScoresN        = [0]*(len(self.chromosomes)+1)
      
       #for chrId in self.chromosomes:
-      for chrId in range(1,6):
+      #for chrId in range(1,6):
+      for chrId in range(1,2):
          for strandId in self.strands:
             print (chrId,strandId)
             self.prefetch_seq_and_scores(chrId,strandId)
@@ -43,21 +48,11 @@ class Lookup:
       assert chromo in self.chromosomes
       assert strand in self.strands
 
-      if strand == '+':
-         currentSequence         = self.genomicSequencesP[chromo][start:stop+1]
-         currentDonorScores     = self.donorScoresP[chromo][start:stop+1]
-         currentAcceptorScores  = self.acceptorScoresP[chromo][start:stop+1]
-      elif strand == '-':
-         end = len(self.genomicSequencesN[chromo])
-         currentSequence         = self.genomicSequencesN[chromo][end-stop-1:end-start]
-         #length = len(currentSequence)
-         currentDonorScores     = self.donorScoresN[chromo][end-stop-1:end-start]
-         #currentDonorScores     += [-inf]*(length-len(currentDonorScores))
-         currentAcceptorScores  = self.acceptorScoresN[chromo][end-stop-1:end-start]
-      else:
-         pdb.set_trace()
+      currentSequence         = self.genomicSequencesP[chromo][start:stop]
+      currentDonorScores      = self.donorScoresP[chromo][start:stop]
+      currentAcceptorScores   = self.acceptorScoresP[chromo][start:stop]
 
-      return currentSequence.lower(), currentAcceptorScores, currentDonorScores
+      return currentSequence, currentAcceptorScores, currentDonorScores
 
 
    def prefetch_seq_and_scores(self,chromo,strand):
@@ -66,60 +61,30 @@ class Lookup:
       returns then the genomic sequence of this interval and the associated scores.
       """
 
-      fn = 'chr%d.dna.flat' % chromo
-      filename = os.path.join(self.dna_flat_files,fn)
-      size = get_flatfile_size(filename)
+      genomicSeq_start  = 0
 
-      #  we do not use the first/last 205 position as they are can not be
-      #  predicted due to the window size used for splice site prediction
-      genomicSeq_start  = 205
-      genomicSeq_stop   = size-205
-      genomicSeq,currentAcc,currentDon = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,self.dna_flat_files)
-      genomicSeq = genomicSeq.lower()
+      if strand == '-':
+         genomicSeq_stop   = self.seqInfo.chromo_sizes[chromo+7]
+      else:
+         genomicSeq_stop   = self.seqInfo.chromo_sizes[chromo]
 
-      U = 'u'*205
-      INF_F = [-inf]*205
+      genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
+      genomicSeq = genomicSeq.lower()
 
       if strand == '+':
-         self.genomicSequencesP[chromo] = U + genomicSeq + U
-         self.acceptorScoresP[chromo]   = INF_F + currentAcc + INF_F
-         self.donorScoresP[chromo]      = INF_F + currentDon + INF_F
+         self.genomicSequencesP[chromo] = genomicSeq
+         self.acceptorScoresP[chromo]   = currentAcc
+         self.donorScoresP[chromo]      = currentDon
       elif strand == '-':
-         self.genomicSequencesN[chromo] = U + genomicSeq + U
-         self.acceptorScoresN[chromo]   = INF_F + currentAcc + INF_F
-         self.donorScoresN[chromo]      = INF_F + currentDon + INF_F
+         self.genomicSequencesN[chromo] = genomicSeq
+         self.acceptorScoresN[chromo]   = currentAcc
+         self.donorScoresN[chromo]      = currentDon
       else:
          assert False
 
 
-   def getSpliceScores(self,chr,strand,intervalBegin,intervalEnd):
-      """
-      Now we want to use interval_query to get the predicted splice scores trained
-      on the TAIR sequence and annotation.
-      """
-      
-      assert intervalEnd > intervalBegin
-
-      size = intervalEnd-intervalBegin
-
-      acc = size*[0.0]
-      don = size*[0.0]
-
-      interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
-      pos_size = new_intp()
-      intp_assign(pos_size,1)
-
-      # fetch acceptor scores
-      sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
-      acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
-
-      # fetch donor scores
-      sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
-      don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
-
-      return acc, don
-
-
 if __name__ == '__main__':
    dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
    lt1 = Lookup(dna_flat_files)
+
+   #self.lt1.seqInfo.get_seq_and_scores
index ee17503..7f8dea2 100644 (file)
@@ -208,7 +208,6 @@ class PipelineHeuristic:
          if ctr % 1000 == 0:
             print ctr
 
-
          id       = location['id']
          chr      = location['chr']
          pos      = location['pos']
@@ -229,7 +228,6 @@ class PipelineHeuristic:
 
          strand = strand_map[strand]
 
-
          if not chr in range(1,6):
             continue
 
@@ -237,6 +235,7 @@ class PipelineHeuristic:
 
          # forgot to do this
          if strand == '-':
+            pos = self.lt1.seqInfo.chromo_sizes[chromo+7]-pos-self.read_size
             unb_seq = reverse_complement(unb_seq)
 
          effective_len = len(unb_seq)