+ extended Lookup object to support negative strand
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 4 Jun 2008 17:49:08 +0000 (17:49 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 4 Jun 2008 17:49:08 +0000 (17:49 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9395 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/sequence_utils.py
scripts/Lookup.py

index d6864a8..5e00cee 100644 (file)
@@ -166,8 +166,6 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
       assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
       assert len(currentAcc) == len(currentDon)
 
-      pdb.set_trace()
-
       return genomicSeq, currentAcc, currentDon
 
    # build reverse complement if on negative strand
@@ -183,15 +181,15 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
          print 'Error occurred while trying to obtain file size'
       end = int(out)
 
-      intervalBegin = genomicSeq_start#-100
-      intervalEnd    = genomicSeq_stop#+100
+      intervalBegin = genomicSeq_start-100
+      intervalEnd    = genomicSeq_stop+100
 
       total_size = end
       currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size)
 
-      #currentAcc = currentAcc[100:-98]
+      currentAcc = currentAcc[100:-98]
       currentAcc = currentAcc[1:]
-      #currentDon = currentDon[100:-100]
+      currentDon = currentDon[100:-100]
 
       length = len(genomicSeq)
       currentAcc = currentAcc[:length]
@@ -199,7 +197,7 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
 
       currentDon = currentDon+[-inf]*(length-len(currentDon))
 
-      ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
+      ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
       gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
 
       assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
@@ -207,3 +205,20 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
       assert len(currentAcc) == len(currentDon), pdb.set_trace()
 
       return genomicSeq, currentAcc, currentDon
+
+
+def checks():
+   start  = 1001
+   stop   = start+1234
+   dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+
+   for chromo in range(1,6):
+         p_seq,_p_acc,_p_don = get_seq_and_scores(chromo,'+',start,stop,dna_flat_files)
+         n_seq,n_acc,n_don = get_seq_and_scores(chromo,'-',start,stop,dna_flat_files)
+
+         print p_seq == reverse_complement(n_seq)
+
+         
+
+if __name__ == '__main__':
+   checks()
index 52986b7..a3ccdd6 100644 (file)
@@ -14,7 +14,7 @@ from Genefinding import *
 from genome_utils import load_genomic
 
 # only for checking purposes
-#from compile_dataset import  get_seq_and_scores
+from qpalma.sequence_utils import  get_seq_and_scores
 
 
 class Lookup:
@@ -30,7 +30,7 @@ class Lookup:
       self.max_size = 40000000
 
       self.chromosomes = range(1,6)
-      self.strands = ['+']
+      self.strands = ['-']
 
       self.genomicSequencesP   = [0]*(len(self.chromosomes)+1)
       self.acceptorScoresP     = [0]*(len(self.chromosomes)+1)
@@ -40,7 +40,8 @@ class Lookup:
       self.acceptorScoresN     = [0]*(len(self.chromosomes)+1)
       self.donorScoresN        = [0]*(len(self.chromosomes)+1)
      
-      for chrId in self.chromosomes:
+      #for chrId in self.chromosomes:
+      for chrId in range(4,6):
          for strandId in self.strands:
             print (chrId,strandId)
             self.prefetch_seq_and_scores(chrId,strandId)
@@ -67,11 +68,48 @@ class Lookup:
       else:
          pdb.set_trace()
 
-
       return currentSequence.lower(), currentAcceptorScores, currentDonorScores
 
 
-   def prefetch_seq_and_scores(self,chr,strand):
+   def prefetch_seq_and_scores(self,chromo,strand):
+      """
+      This function expects an interval, chromosome and strand information and
+      returns then the genomic sequence of this interval and the associated scores.
+      """
+
+      fn = 'chr%d.dna.flat' % chromo
+      filename = os.path.join(self.dna_flat_files,fn)
+      cmd =  'wc -c %s | cut -f1 -d \' \'' % filename
+      import subprocess
+      obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+      out,err = obj.communicate()
+      if err != '':
+         print 'Error occurred while trying to obtain file size'
+      size = int(out)
+
+      genomicSeq_start  = 205
+      genomicSeq_stop   = size-205
+      genomicSeq,currentAcc,currentDon = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,self.dna_flat_files)
+      genomicSeq = genomicSeq.lower()
+
+      U = 'u'*205
+      INF_F = [-inf]*205
+
+      if strand == '+':
+         self.genomicSequencesP[chromo] = U + genomicSeq + U
+         self.acceptorScoresP[chromo]   = INF_F + currentAcc + INF_F
+         self.donorScoresP[chromo]      = INF_F + currentDon + INF_F
+
+      elif strand == '-':
+         self.genomicSequencesN[chromo] = U + genomicSeq + U
+         self.acceptorScoresN[chromo]   = INF_F + currentAcc + INF_F
+         self.donorScoresN[chromo]      = INF_F + currentDon + INF_F
+
+      else:
+         assert False
+
+
+   def _prefetch_seq_and_scores(self,chr,strand):
       """
       This function expects an interval, chromosome and strand information and
       returns then the genomic sequence of this interval and the associated scores.
@@ -147,10 +185,10 @@ if __name__ == '__main__':
       start = random.randint(40,10000)
       stop  = start + random.randint(10,100) 
 
-      seq,acc,don    =  lt1.get_seq_and_scores(chro,'+',start,stop,'')
-      _seq,_acc,_don = get_seq_and_scores(chro,'+',start,stop,dna_flat_files)
+      seq,acc,don    =  lt1.get_seq_and_scores(chro,'-',start,stop,'')
+      _seq,_acc,_don = get_seq_and_scores(chro,'-',start,stop,dna_flat_files)
 
-      assert _seq == seq
+      assert _seq == seq, pdb.set_trace()
       assert _acc == acc
       assert _don == don
 
@@ -161,5 +199,5 @@ if __name__ == '__main__':
       #assert _acc == acc
       #assert _don == don
 
-
    pdb.set_trace()
+