+ extended Lookup object to support negative strand
[qpalma.git] / scripts / Lookup.py
index 52986b7..a3ccdd6 100644 (file)
@@ -14,7 +14,7 @@ from Genefinding import *
 from genome_utils import load_genomic
 
 # only for checking purposes
-#from compile_dataset import  get_seq_and_scores
+from qpalma.sequence_utils import  get_seq_and_scores
 
 
 class Lookup:
@@ -30,7 +30,7 @@ class Lookup:
       self.max_size = 40000000
 
       self.chromosomes = range(1,6)
-      self.strands = ['+']
+      self.strands = ['-']
 
       self.genomicSequencesP   = [0]*(len(self.chromosomes)+1)
       self.acceptorScoresP     = [0]*(len(self.chromosomes)+1)
@@ -40,7 +40,8 @@ class Lookup:
       self.acceptorScoresN     = [0]*(len(self.chromosomes)+1)
       self.donorScoresN        = [0]*(len(self.chromosomes)+1)
      
-      for chrId in self.chromosomes:
+      #for chrId in self.chromosomes:
+      for chrId in range(4,6):
          for strandId in self.strands:
             print (chrId,strandId)
             self.prefetch_seq_and_scores(chrId,strandId)
@@ -67,11 +68,48 @@ class Lookup:
       else:
          pdb.set_trace()
 
-
       return currentSequence.lower(), currentAcceptorScores, currentDonorScores
 
 
-   def prefetch_seq_and_scores(self,chr,strand):
+   def prefetch_seq_and_scores(self,chromo,strand):
+      """
+      This function expects an interval, chromosome and strand information and
+      returns then the genomic sequence of this interval and the associated scores.
+      """
+
+      fn = 'chr%d.dna.flat' % chromo
+      filename = os.path.join(self.dna_flat_files,fn)
+      cmd =  'wc -c %s | cut -f1 -d \' \'' % filename
+      import subprocess
+      obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+      out,err = obj.communicate()
+      if err != '':
+         print 'Error occurred while trying to obtain file size'
+      size = int(out)
+
+      genomicSeq_start  = 205
+      genomicSeq_stop   = size-205
+      genomicSeq,currentAcc,currentDon = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,self.dna_flat_files)
+      genomicSeq = genomicSeq.lower()
+
+      U = 'u'*205
+      INF_F = [-inf]*205
+
+      if strand == '+':
+         self.genomicSequencesP[chromo] = U + genomicSeq + U
+         self.acceptorScoresP[chromo]   = INF_F + currentAcc + INF_F
+         self.donorScoresP[chromo]      = INF_F + currentDon + INF_F
+
+      elif strand == '-':
+         self.genomicSequencesN[chromo] = U + genomicSeq + U
+         self.acceptorScoresN[chromo]   = INF_F + currentAcc + INF_F
+         self.donorScoresN[chromo]      = INF_F + currentDon + INF_F
+
+      else:
+         assert False
+
+
+   def _prefetch_seq_and_scores(self,chr,strand):
       """
       This function expects an interval, chromosome and strand information and
       returns then the genomic sequence of this interval and the associated scores.
@@ -147,10 +185,10 @@ if __name__ == '__main__':
       start = random.randint(40,10000)
       stop  = start + random.randint(10,100) 
 
-      seq,acc,don    =  lt1.get_seq_and_scores(chro,'+',start,stop,'')
-      _seq,_acc,_don = get_seq_and_scores(chro,'+',start,stop,dna_flat_files)
+      seq,acc,don    =  lt1.get_seq_and_scores(chro,'-',start,stop,'')
+      _seq,_acc,_don = get_seq_and_scores(chro,'-',start,stop,dna_flat_files)
 
-      assert _seq == seq
+      assert _seq == seq, pdb.set_trace()
       assert _acc == acc
       assert _don == don
 
@@ -161,5 +199,5 @@ if __name__ == '__main__':
       #assert _acc == acc
       #assert _don == don
 
-
    pdb.set_trace()
+