+ extended Lookup object to support negative strand
[qpalma.git] / qpalma / sequence_utils.py
index d6864a8..5e00cee 100644 (file)
@@ -166,8 +166,6 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
       assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
       assert len(currentAcc) == len(currentDon)
 
-      pdb.set_trace()
-
       return genomicSeq, currentAcc, currentDon
 
    # build reverse complement if on negative strand
@@ -183,15 +181,15 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
          print 'Error occurred while trying to obtain file size'
       end = int(out)
 
-      intervalBegin = genomicSeq_start#-100
-      intervalEnd    = genomicSeq_stop#+100
+      intervalBegin = genomicSeq_start-100
+      intervalEnd    = genomicSeq_stop+100
 
       total_size = end
       currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size)
 
-      #currentAcc = currentAcc[100:-98]
+      currentAcc = currentAcc[100:-98]
       currentAcc = currentAcc[1:]
-      #currentDon = currentDon[100:-100]
+      currentDon = currentDon[100:-100]
 
       length = len(genomicSeq)
       currentAcc = currentAcc[:length]
@@ -199,7 +197,7 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
 
       currentDon = currentDon+[-inf]*(length-len(currentDon))
 
-      ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
+      ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
       gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
 
       assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
@@ -207,3 +205,20 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
       assert len(currentAcc) == len(currentDon), pdb.set_trace()
 
       return genomicSeq, currentAcc, currentDon
+
+
+def checks():
+   start  = 1001
+   stop   = start+1234
+   dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+
+   for chromo in range(1,6):
+         p_seq,_p_acc,_p_don = get_seq_and_scores(chromo,'+',start,stop,dna_flat_files)
+         n_seq,n_acc,n_don = get_seq_and_scores(chromo,'-',start,stop,dna_flat_files)
+
+         print p_seq == reverse_complement(n_seq)
+
+         
+
+if __name__ == '__main__':
+   checks()