index 438e525..4ea1cb9 100644 (file)
@@ -79,7 +79,7 @@ class PipelineHeuristic:

# when we look for alternative alignments with introns this value is the
# mean of intron size
-      self.intron_size  = 90
+      self.intron_size  = 250

@@ -178,7 +178,7 @@ class PipelineHeuristic:
if strand == '-':
continue

-            if ctr == 100:
+            if ctr == 1000:
break

#if pos > 10000000:
@@ -260,26 +260,50 @@ class PipelineHeuristic:
print 'True neg / false neg : %d %d'%(self.true_neg,self.false_neg)

-   def findHighestScoringSpliceSites(self,currentAcc,currentDon):
-      max_don  = -inf
-      don_pos  = []
-      for idx,score in enumerate(currentDon):
-         if score > -inf and idx > 1 and idx < self.read_size:
-            don_pos.append(idx)
+   def findHighestScoringSpliceSites(self, currentAcc, currentDon, DNA, max_intron_size, read_size, splice_thresh):

-         if len(don_pos) == 2:
-            break
+      def signum(a):
+          if a>0:
+              return 1
+         elif a<0:
+              return -1
+          else:
+              return 0

-      max_acc  = -inf
-      acc_pos  = []
-      for idx,score in enumerate(currentAcc):
-         if score > -inf and idx >= self.intron_size:
-            acc_pos = idx
-            #acc_pos.append(idx)
-            break
+      proximal_acc   = []
+      for idx in xrange(max_intron_size, max_intron_size+read_size):
+          if currentAcc[idx]>= splice_thresh:
+            proximal_acc.append((idx,currentAcc[idx]))

-      return don_pos,acc_pos
+      proximal_acc.sort(lambda x,y: signum(x[1]-y[1]))
+      proximal_acc=proximal_acc[-2:]

+      distal_acc   = []
+      for idx in xrange(max_intron_size+read_size, len(currentAcc)):
+          if currentAcc[idx]>= splice_thresh:
+            distal_acc.append((idx, currentAcc[idx], DNA[idx+1:idx+read_size]))
+
+      #distal_acc.sort(lambda x,y: signum(x[1]-y[1]))
+      #distal_acc=distal_acc[-2:]
+
+
+      proximal_don   = []
+      for idx in xrange(max_intron_size, max_intron_size+read_size):
+         if currentDon[idx] >= splice_thresh:
+            proximal_don.append((idx, currentDon[idx]))
+
+      proximal_don.sort(lambda x,y: signum(x[1]-y[1]))
+      proximal_don=proximal_don[-2:]
+
+      distal_don   = []
+      for idx in xrange(1, max_intron_size):
+         if currentDon[idx] >= splice_thresh:
+            distal_don.append((idx, currentDon[idx], DNA[idx-read_size:idx]))
+
+      #distal_don.sort(lambda x,y: signum(x[1]-y[1]))
+      #distal_don=distal_don[-2:]
+
+      return proximal_acc,proximal_don,distal_acc,distal_don

def calcAlternativeAlignments(self,location):
"""
@@ -289,43 +313,38 @@ class PipelineHeuristic:
"""

run = self.run
+      splice_thresh = 0.01
+      max_intron_size = 2000

id       = location['id']
chr      = location['chr']
pos      = location['pos']
strand   = location['strand']
-      seq      = location['seq']
-      #orig_seq = location['orig_seq']
-      prb      = location['prb']
+      original_est = location['seq']
+      quality      = location['prb']
cal_prb  = location['cal_prb']

-      orig_seq = self.original_reads[int(id)]
+      est = unbracket_est(original_est)
+      effective_len = len(est)

-      unb_seq = unbracket_est(seq)
-      effective_len = len(unb_seq)
-
-      genomicSeq_start  = pos
-      genomicSeq_stop   = pos+self.intron_size*2+self.read_size*2
+      genomicSeq_start  = pos - max_intron_size
+      genomicSeq_stop   = pos + max_intron_size + len(est)

start = cpu()
currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
stop = cpu()
self.get_time += stop-start
-
dna   = currentDNASeq

-      start = cpu()
-      alt_don_pos,acc_pos = self.findHighestScoringSpliceSites(currentAcc,currentDon)
-      stop = cpu()
-      self.splice_site_time = stop-start
-
+      proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc,currentDon, dna, max_intron_size, len(est), splice_thresh)
+
+      print proximal_acc
+      print proximal_don
+      print distal_acc
+      print distal_don
+
alternativeScores = []
-
-      exons          = zeros((2,2),dtype=numpy.int)
-      est            = unb_seq
-      original_est   = seq
-      quality        = prb
-
+
# inlined
h = self.h
d = self.d
@@ -333,61 +352,67 @@ class PipelineHeuristic:
mmatrix = self.mmatrix
qualityPlifs = self.qualityPlifs
# inlined
+
+      # compute dummy scores
+      #IntronScore = calculatePlif(h, [math.fabs(max_acc_pos-30)])[0]
+      #dummyAcceptorScore = calculatePlif(a, [max_acc_score])[0]
+      IntronScore = calculatePlif(h, [self.intron_size])[0] - 0.5
+      dummyAcceptorScore = calculatePlif(a, [0.25])[0]
+      dummyDonorScore = calculatePlif(d, [0.25])[0]

_start = cpu()
-      for don_pos in alt_don_pos:
-         start = cpu()
-
-         exons[0,0]     = 0
-         exons[0,1]     = don_pos
-         exons[1,0]     = acc_pos+1
-         exons[1,1]     = acc_pos+1+(self.read_size-don_pos)
-
-         _dna = dna[:int(exons[1,1])]
-         _dna = _dna[:exons[1,0]] + orig_seq[don_pos:]
-
-         _currentAcc = currentAcc[:int(exons[1,1])]
-         _currentAcc = [0.25]*len(_currentAcc)
-
-         _currentDon = currentDon[:int(exons[1,1])]
-         _currentDon = [0.25]*len(_currentDon)
-
-         currentVMatchAlignment = _dna, exons, est, original_est, quality,\
-         _currentAcc, _currentDon
-
-         stop = cpu()
-         self.array_stuff += stop - start
-
-         #alternativeScore = self.calcAlignmentScore(currentVMatchAlignment)
-         #alternativeScores.append(self.calcAlignmentScore(currentVMatchAlignment))
-
-         # Lets start calculation
-         dna, exons, est, original_est, quality, acc_supp, don_supp =\
-         currentVMatchAlignment
-
-         # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
-         trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
-         computeSpliceAlignWithQuality(dna, exons, est, original_est,\
-         quality, qualityPlifs, run)
-
-         stop = cpu()
-         self.computeSpliceAlignWithQualityTime += stop-start
-         start = cpu()
-
-         # Calculate the weights
-         trueWeightDon, trueWeightAcc, trueWeightIntron =\
-         computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+      for (don_pos,don_score) in proximal_don:
+         # remove mismatching positions in the second exon
+         original_est_cut=''

-         stop = cpu()
-         self.computeSpliceWeightsTime += stop-start
+         est_ptr=0
+         dna_ptr=0
+         ptr=0
+         while ptr<len(original_est):
+
+            if original_est[ptr]=='[':
+                dnaletter=original_est[ptr+1]
+                estletter=original_est[ptr+2]
+                if dna_ptr < don_pos:
+                    original_est_cut+=original_est[ptr:ptr+4]
+                else:
+                    #original_est_cut+=estletter # EST letter
+                    original_est_cut+=dnaletter # DNA letter
+                ptr+=4
+            else:
+                dnaletter=original_est[ptr]
+                estletter=dnaletter
+
+                original_est_cut+=estletter # EST letter
+                ptr+=1

-         start = cpu()
+            if estletter=='-':
+                dna_ptr+=1
+            elif dnaletter=='-':
+                est_ptr+=1
+            else:
+                dna_ptr+=1
+                est_ptr+=1
+
+         assert(dna_ptr<=len(dna))
+         assert(est_ptr<=len(est))

-         trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+         #print "Donor"
+         DonorScore = calculatePlif(d, [don_score])[0]
+         #print DonorScore,don_score,don_pos
+
+         score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
+         score += dummyAcceptorScore + IntronScore + DonorScore
+
+         #print 'diff %f,%f,%f' % ((trueWeight.T * self.currentPhi)[0,0] - score,(trueWeight.T * self.currentPhi)[0,0], score)
+         alternativeScores.append(score)

-         # Calculate w'phi(x,y) the total score of the alignment
-         alternativeScores.append((trueWeight.T * self.currentPhi)[0,0])
+      _stop = cpu()
+      self.alternativeScoresTime += _stop-_start

+      _start = cpu()
+      for (acc_pos,acc_score) in alt_acc:
+         # remove mismatching positions in the second exon
original_est_cut=''

est_ptr=0
@@ -398,16 +423,16 @@ class PipelineHeuristic:
if original_est[ptr]=='[':
dnaletter=original_est[ptr+1]
estletter=original_est[ptr+2]
-                if est_ptr<=exons[0,1]:
-                    original_est_cut.join(original_est[ptr:ptr+4])
+                if est_ptr>=acc_pos:
+                    original_est_cut+=original_est[ptr:ptr+4]
else:
-                    original_est_cut.join(estletter) # EST letter
+                    original_est_cut+=estletter # EST letter
ptr+=4
else:
dnaletter=original_est[ptr]
estletter=dnaletter

-                original_est_cut.join(estletter) # EST letter
+                original_est_cut+=estletter # EST letter
ptr+=1

if estletter=='-':
@@ -421,15 +446,21 @@ class PipelineHeuristic:
assert(dna_ptr<=len(dna))
assert(est_ptr<=len(est))

+         #print original_est,original_est_cut

-         # new score
-         score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
+         AcceptorScore = calculatePlif(d, [acc_score])[0]
+         #print "Acceptor"
+         #print AcceptorScore,acc_score,acc_pos

-         if score!=0.0:
-             print 'diff %f' % ((trueWeight.T * self.currentPhi)[0,0] - score)
-
-         stop = cpu()
-         self.DotProdTime += stop-start
+         #if acc_score<0.1:
+         #    print currentAcc[0:50]
+         #    print currentDon[0:50]
+
+         score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
+         score += AcceptorScore + IntronScore + dummyDonorScore
+
+         #print 'diff %f,%f,%f' % ((trueWeight.T * self.currentPhi)[0,0] - score,(trueWeight.T * self.currentPhi)[0,0], score)
+         alternativeScores.append(score)

_stop = cpu()
self.alternativeScoresTime += _stop-_start
@@ -472,10 +503,10 @@ if __name__ == '__main__':
jp = os.path.join

run_fname   = jp(dir,'run_object.pickle')
-   #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_flag'
+   #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_1k'

-   #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_2k'
-   data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_100'
+   data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_2k'
+   #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_100'

param_fname = jp(dir,'param_500.pickle')