+ added proper treatment of reads overlapping into first/last 250 nucleotides of
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 23 Jun 2008 08:43:31 +0000 (08:43 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 23 Jun 2008 08:43:31 +0000 (08:43 +0000)
a chromosome

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9708 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/sequence_utils.py

index f9043ec..693d7df 100644 (file)
@@ -69,12 +69,12 @@ def reverse_complement(seq):
    return ret_val
 
 
-def unbracket_est(est):
+def unbracket_seq(seq):
    """
    This function takes a read in bracket notation and restores the read sequence from it.
    I.e.
 
-   est = cgt[ac][tg]aa
+   seq = cgt[ac][tg]aa
 
    leads to
 
@@ -84,21 +84,21 @@ def unbracket_est(est):
    entry is the base from the dna.
    """
 
-   new_est = ''
+   new_seq = ''
    e = 0
 
    while True:
-      if e >= len(est):
+      if e >= len(seq):
          break
 
-      if est[e] == '[':
-         new_est += est[e+2]
+      if seq[e] == '[':
+         new_seq += seq[e+2]
          e += 4
       else:
-         new_est += est[e]
+         new_seq += seq[e]
          e += 1
 
-   return "".join(new_est).lower()
+   return "".join(new_seq).lower()
 
 
 def create_bracket_seq(dna_seq,read_seq):
@@ -199,6 +199,7 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
 
    length = len(genomicSeq)
 
+   # indices for positive strand
    if strand == '+':
       currentAcc = currentAcc[:length]
       currentDon = currentDon+[-inf]*(length-len(currentDon))
@@ -206,11 +207,10 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
       ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
       gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
 
-   # build reverse complement if on negative strand
+   # take care of different indexation if on negative strand
    if strand == '-':
       currentAcc = currentAcc[:length]
       currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
-
       currentDon = currentDon+[-inf]*(length-len(currentDon))
 
       ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
@@ -220,6 +220,14 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
       assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
       assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
       assert len(currentAcc) == len(currentDon), pdb.set_trace()
+   else
+      for pos in ag_tuple_pos:
+         if currentAcc[pos] == -inf:
+            currentAcc[pos] = 0.01
+
+      for pos in gt_tuple_pos:
+         if currentDon[pos] == -inf:
+            currentDon[pos] = 0.01
 
    return genomicSeq, currentAcc, currentDon