+ changed check for intial setting of old_w. (matrix != None does not work in cvxopt)
[qpalma.git] / qpalma / Lookup.py
index f22ecd7..baa6419 100644 (file)
@@ -1,10 +1,17 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
 
 import sys
 import os
+import pdb
+from numpy import inf
 
-from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size,DataAccessWrapper
+from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size,DataAccessWrapper,reverse_complement
 
 
 class LookupTable:
@@ -18,8 +25,6 @@ class LookupTable:
       accessWrapper = DataAccessWrapper(settings)
       self.seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
 
-      self.strands = ['+','-']
-   
       # take care that it also works say if we get a chromo_list [1,3,5]
       # those are mapped then to 0,1,2
       self.chromo_list = chromo_list
@@ -33,71 +38,74 @@ class LookupTable:
       self.acceptorScores     = {}
       self.donorScores        = {}
 
-      for strandId in self.strands:
-         self.genomicSequences[strandId] = [0]*(len(self.chromo_list))
-         self.acceptorScores[strandId]   = [0]*(len(self.chromo_list))
-         self.donorScores[strandId]      = [0]*(len(self.chromo_list))
+      self.genomicSequences = [0]*(len(self.chromo_list))
+      self.acceptorScoresPos   = [0]*(len(self.chromo_list))
+      self.donorScoresPos      = [0]*(len(self.chromo_list))
+      self.acceptorScoresNeg   = [0]*(len(self.chromo_list))
+      self.donorScoresNeg      = [0]*(len(self.chromo_list))
 
       for chrId in self.chromo_list:
-         for strandId in self.strands:
-            print (chrId,strandId)
-            self.prefetch_seq_and_scores(chrId,strandId)
+         self.prefetch_seq_and_scores(chrId)
 
  
    def get_seq_and_scores(self,chromo,strand,start,stop):
       assert chromo in self.chromo_list
-      assert strand in self.strands
-
       assert start <= stop
 
       chromo_idx = self.chromo_map[chromo]
 
-      start -= 1
-      stop  -= 1
-
-      currentSequence         = self.genomicSequences[strand][chromo_idx][start:stop]
 
-      if strand == '-':
-         start += 2
-         stop  += 2
+      if strand == '+':
+         currentSequence         = self.genomicSequences[chromo_idx][start:stop]
+         currentAcceptorScores   = self.acceptorScoresPos[chromo_idx][start:stop]
+         currentDonorScores      = self.donorScoresPos[chromo_idx][start:stop]
+      elif strand == '-':
+         currentSequence         = self.genomicSequences[chromo_idx][start:stop]
+         currentSequence = reverse_complement(currentSequence)
 
-      currentAcceptorScores   = self.acceptorScores[strand][chromo_idx][start:stop]
-      currentDonorScores      = self.donorScores[strand][chromo_idx][start:stop]
+         total_size = self.seqInfo.getFragmentSize(chromo)
+         _start = total_size - stop
+         _stop = total_size - start
+         currentAcceptorScores   = self.acceptorScoresNeg[chromo_idx][_start:_stop]
+         currentDonorScores      = self.donorScoresNeg[chromo_idx][_start:_stop]
 
-      if strand == '-':
-         currentSequence = currentSequence[::-1]
-         currentAcceptorScores.reverse()
-         currentDonorScores.reverse()
+         #gt_tuple_pos = [p for p,e in enumerate(currentSequence) if (p>0 and p<len(currentSequence)-1 and e=='g' ) and (currentSequence[p+1]=='t' or currentSequence[p+1]=='c')]
+         #ag_tuple_pos = [p for p,e in enumerate(currentSequence) if p>1 and currentSequence[p-1]=='a' and currentSequence[p]=='g']
+         #acc_pos = [p for p,e in enumerate(currentAcceptorScores) if e != -inf and p > 1]
+         #don_pos = [p for p,e in enumerate(currentDonorScores) if e != -inf and p > 0]
 
       return currentSequence, currentAcceptorScores, currentDonorScores
 
 
-   def prefetch_seq_and_scores(self,chromo,strand):
+   def prefetch_seq_and_scores(self,chromo):
       """
       This function expects an interval, chromosome and strand information and
       returns then the genomic sequence of this interval and the associated scores.
       """
       assert chromo in self.chromo_list
-      assert strand in self.strands
 
-      chromo_idx = self.chromo_map[chromo]
+      genomicSeq_start  = 0
+      genomicSeq_stop   = self.seqInfo.getFragmentSize(chromo)
 
-      genomicSeq_start  = 1
-      genomicSeq_stop   = self.seqInfo.chromo_sizes[chromo]-1
+      print 'lt total_size %d' % self.seqInfo.getFragmentSize(chromo)
 
-      print 'lt total_size %d' % self.seqInfo.chromo_sizes[chromo]
+      genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,'+',genomicSeq_start,genomicSeq_stop)
+      genomicSeq = genomicSeq.lower()
 
-      if strand == '-':
-         genomicSeq_stop   = self.seqInfo.chromo_sizes[chromo]+1
+      chromo_idx = self.chromo_map[chromo]
 
-      genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,False,False)
-      genomicSeq = genomicSeq.lower()
+      self.genomicSequences[chromo_idx] = genomicSeq
+      self.acceptorScoresPos[chromo_idx]   = currentAcc
+      self.donorScoresPos[chromo_idx]      = currentDon
+
+      dummy,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,'-',genomicSeq_start,genomicSeq_stop)
 
-      if strand == '-':
-         genomicSeq = genomicSeq[::-1]
-         currentAcc.reverse()
-         currentDon.reverse()
+      #genomicSeq = reverse_complement(self.genomicSequences[chromo_idx])
+      #gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+      #ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
+      #acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
+      #don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
+      #pdb.set_trace()
 
-      self.genomicSequences[strand][chromo_idx] = genomicSeq
-      self.acceptorScores[strand][chromo_idx]   = currentAcc
-      self.donorScores[strand][chromo_idx]      = currentDon
+      self.acceptorScoresNeg[chromo_idx]   = currentAcc
+      self.donorScoresNeg[chromo_idx]      = currentDon