index f2b3f90..4ea1cb9 100644 (file)
@@ -79,7 +79,7 @@ class PipelineHeuristic:

# when we look for alternative alignments with introns this value is the
# mean of intron size
-      self.intron_size  = 150
+      self.intron_size  = 250

@@ -178,7 +178,7 @@ class PipelineHeuristic:
if strand == '-':
continue

-            if ctr == 100:
+            if ctr == 1000:
break

#if pos > 10000000:
@@ -260,30 +260,50 @@ class PipelineHeuristic:
print 'True neg / false neg : %d %d'%(self.true_neg,self.false_neg)

-   def findHighestScoringSpliceSites(self,currentAcc,currentDon):
+   def findHighestScoringSpliceSites(self, currentAcc, currentDon, DNA, max_intron_size, read_size, splice_thresh):

-      acc   = []
-      for idx,score in enumerate(currentAcc):
-         if score > -inf:
-            acc.append((idx,score))
-             break
+      def signum(a):
+          if a>0:
+              return 1
+         elif a<0:
+              return -1
+          else:
+              return 0
+
+      proximal_acc   = []
+      for idx in xrange(max_intron_size, max_intron_size+read_size):
+          if currentAcc[idx]>= splice_thresh:
+            proximal_acc.append((idx,currentAcc[idx]))
+
+      proximal_acc.sort(lambda x,y: signum(x[1]-y[1]))
+      proximal_acc=proximal_acc[-2:]
+
+      distal_acc   = []
+      for idx in xrange(max_intron_size+read_size, len(currentAcc)):
+          if currentAcc[idx]>= splice_thresh:
+
+      #distal_acc.sort(lambda x,y: signum(x[1]-y[1]))
+      #distal_acc=distal_acc[-2:]

-      acc.sort(lambda x,y: x[0]-y[0])
-      acc=acc[-2:]
-
-      don   = []
-      for idx,score in enumerate(currentDon):
-         if score > -inf:
-            don.append((idx,score))
-             break

-      don.sort(lambda x,y: x[0]-y[0])
-      don=don[-2:]
+      proximal_don   = []
+      for idx in xrange(max_intron_size, max_intron_size+read_size):
+         if currentDon[idx] >= splice_thresh:
+            proximal_don.append((idx, currentDon[idx]))

-      return don,acc
+      proximal_don.sort(lambda x,y: signum(x[1]-y[1]))
+      proximal_don=proximal_don[-2:]

+      distal_don   = []
+      for idx in xrange(1, max_intron_size):
+         if currentDon[idx] >= splice_thresh:
+
+      #distal_don.sort(lambda x,y: signum(x[1]-y[1]))
+      #distal_don=distal_don[-2:]
+
+      return proximal_acc,proximal_don,distal_acc,distal_don

def calcAlternativeAlignments(self,location):
"""
@@ -293,6 +313,8 @@ class PipelineHeuristic:
"""

run = self.run
+      splice_thresh = 0.01
+      max_intron_size = 2000

id       = location['id']
chr      = location['chr']
@@ -305,8 +327,8 @@ class PipelineHeuristic:
est = unbracket_est(original_est)
effective_len = len(est)

-      genomicSeq_start  = pos
+      genomicSeq_start  = pos - max_intron_size
+      genomicSeq_stop   = pos + max_intron_size + len(est)

start = cpu()
currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
@@ -314,13 +336,12 @@ class PipelineHeuristic:
self.get_time += stop-start
dna   = currentDNASeq

-      alt_don,alt_acc = self.findHighestScoringSpliceSites(currentAcc,currentDon)
-
-      sort_acc=enumerate(currentAcc).sort(lambda x,y: x[1]-y[1])
-      (max_acc_pos,max_acc_score)=sort_acc[-1]
-
-      #print alt_don
-      #print alt_acc
+      proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc,currentDon, dna, max_intron_size, len(est), splice_thresh)
+
+      print proximal_acc
+      print proximal_don
+      print distal_acc
+      print distal_don

alternativeScores = []

@@ -333,13 +354,14 @@ class PipelineHeuristic:
# inlined

# compute dummy scores
-      IntronScore = calculatePlif(h, [math.fabs(max_acc_pos-30)])[0]
-      #print IntronScore
-      dummyAcceptorScore = calculatePlif(a, [max_acc_score])[0]
+      #IntronScore = calculatePlif(h, [math.fabs(max_acc_pos-30)])[0]
+      #dummyAcceptorScore = calculatePlif(a, [max_acc_score])[0]
+      IntronScore = calculatePlif(h, [self.intron_size])[0] - 0.5
+      dummyAcceptorScore = calculatePlif(a, [0.25])[0]
dummyDonorScore = calculatePlif(d, [0.25])[0]

_start = cpu()
-      for (don_pos,don_score) in alt_don:
+      for (don_pos,don_score) in proximal_don:
# remove mismatching positions in the second exon
original_est_cut=''

@@ -354,7 +376,8 @@ class PipelineHeuristic:
if dna_ptr < don_pos:
original_est_cut+=original_est[ptr:ptr+4]
else:
-                    original_est_cut+=estletter # EST letter
+                    #original_est_cut+=estletter # EST letter
+                    original_est_cut+=dnaletter # DNA letter
ptr+=4
else:
dnaletter=original_est[ptr]
@@ -480,10 +503,10 @@ if __name__ == '__main__':
jp = os.path.join

run_fname   = jp(dir,'run_object.pickle')
-   #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_flag'
+   #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_1k'

-   #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_2k'
-   data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_100'
+   data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_2k'
+   #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_100'

param_fname = jp(dir,'param_500.pickle')