+ minor changes: collect one prediction list per example (ground-truth fragment plus vmatch alternatives) and evaluate TRAIN and TEST predictions separately
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 27 Feb 2008 14:59:55 +0000 (14:59 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 27 Feb 2008 14:59:55 +0000 (14:59 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7970 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Evaluation.py
scripts/compile_dataset.py
scripts/qpalma_main.py

scripts/Evaluation.py
index 0bc6bd5..96518bb 100644
@@ -3,11 +3,12 @@
 
 import cPickle
 import sys
+import pydb
+import pdb
 import os
 import os.path
 
 def evaluatePositions(eBegin,eEnd):
-
    eBegin_pos = [elem for elem in eBegin if elem == 0]
    eBegin_neg = [elem for elem in eBegin if elem != 0]
    eEnd_pos = [elem for elem in eEnd if elem == 0]
@@ -33,18 +34,18 @@ def evaluatePositions(eBegin,eEnd):
 
    return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
 
+
 def perform_prediction(current_dir,run_name):
    #cmd = 'echo ./doPrediction.sh %s | qsub -l h_vmem=1.0G -cwd -j y -N \"%s.log\"'%(current_dir,run_name)
    cmd = './doPrediction.sh %s 1>%s.out 2>%s.err' %(current_dir,run_name,run_name)
    os.system(cmd)
    #print cmd
 
+
 def forall_experiments(current_func,tl_dir):
    """
   Given the toplevel directory this function calls for each subdir the
-   prediction function of each found qpalma training instance.
-
-   Once this is done we can collect the results
+   function given as the first argument.
    """
 
    dir_entries = os.listdir(tl_dir)
@@ -53,14 +54,12 @@ def forall_experiments(current_func,tl_dir):
 
    for current_dir in run_dirs:
       run_name = current_dir.split('/')[-1]
-      #print current_dir
-
       current_func(current_dir,run_name)
 
+
 def collect_prediction(current_dir,run_name):
    """
    Given a run directory and its name this function evaluates the stored TRAIN and TEST predictions.
-
    """
 
    train_suffix   = '_allPredictions_TRAIN'
@@ -68,36 +67,53 @@ def collect_prediction(current_dir,run_name):
    jp = os.path.join
    
    filename =  jp(current_dir,run_name)+train_suffix
-   allTrainPredictions = cPickle.load(open(filename))
+   print 'Prediction on: %s' % filename
+   prediction_on(filename)
+
+   filename =  jp(current_dir,run_name)+test_suffix
+   print 'Prediction on: %s' % filename
+   prediction_on(filename)
+
+
+def prediction_on(filename):
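+   """
+   Load one pickled prediction file and print accuracy statistics on the
+   predicted exon boundary offsets.
+   """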
+   allPredictions = cPickle.load(open(filename))
 
    exon1Begin     = []
    exon1End       = []
    exon2Begin     = []
    exon2End       = []
    allWrongExons  = []
-
-   for current_pred in allTrainPredictions:
-      e1_b_off = current_pred['e1_b_off']
-      e1_e_off = current_pred['e1_e_off']
-      e2_b_off = current_pred['e2_b_off']
-      e2_e_off = current_pred['e2_e_off']
-
-      if e1_b_off != None:
-         exon1Begin.append(e1_b_off)
-         exon1End.append(e1_e_off)
-         exon2Begin.append(e2_b_off)
-         exon2End.append(e2_e_off)
-      else:
-         pass
-         #allWrongExons.append((newExons,exons))
-
-      #if e1_b_off == 0 and e1_e_off == 0 and e2_b_off == 0 and e2_e_off == 0:
-      #   print >> logfile, 'example %d correct' % exampleIdx
-      #else:
-      #   print >> logfile, 'example %d wrong' % exampleIdx
-
-      e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = evaluatePositions(exon1Begin,exon1End)
-      e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = evaluatePositions(exon2Begin,exon2End)
+   allDoubleScores = []
+
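+   # each element of allPredictions holds the list of predictions made for one
+   # example: the ground-truth dna fragment first, then the vmatch alternatives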
+   for current_example_pred in allPredictions:
+      ambiguous_match = False
+      example_scores = []
+      if len(current_example_pred) > 1:
+         ambiguous_match = True
+
+      for current_pred in current_example_pred:
+         e1_b_off = current_pred['e1_b_off']
+         e1_e_off = current_pred['e1_e_off']
+         e2_b_off = current_pred['e2_b_off']
+         e2_e_off = current_pred['e2_e_off']
+
+         if e1_b_off != None:
+            exon1Begin.append(e1_b_off)
+            exon1End.append(e1_e_off)
+            exon2Begin.append(e2_b_off)
+            exon2End.append(e2_e_off)
+         else:
+            pass
+            #allWrongExons.append((newExons,exons))
+
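+         # for an ambiguous example keep the DP score of every candidate
+         # prediction (collected in allDoubleScores below)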
+         if ambiguous_match:
+            current_score = current_pred['DPScores'][0]
+            example_scores.append(current_score)
+
+         e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = evaluatePositions(exon1Begin,exon1End)
+         e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = evaluatePositions(exon2Begin,exon2End)
+
+      allDoubleScores.append(example_scores)
 
    all_pos_correct = 0
    for idx in range(len(exon1Begin)):
@@ -105,24 +121,13 @@ def collect_prediction(current_dir,run_name):
       and exon2Begin[idx] == 0 and exon2End[idx] == 0:
          all_pos_correct += 1
 
-
-   print 'Total num. of examples: %d' % len(allTrainPredictions)
+   print 'Total num. of examples: %d' % len(allPredictions)
    print 'Number of total correct examples: %d' % all_pos_correct
    print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
    print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
    print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
 
 
-   #filename =  jp(current_dir,run_name)+test_suffix
-   #allTestPredictions = cPickle.load(open(filename))
-
-   #current_prediction = {'e1_b_off':e1_b_off,'e1_e_off':e1_e_off,
-   #'e2_b_off':e2_b_off ,'e2_e_off':e2_e_off,\
-   #'predExons':newExons, 'trueExons':exons,\
-   #'dna':dna, 'est':est,\
-   #'DPScores':newDPScores }
-
-
 if __name__ == '__main__':
    dir = sys.argv[1]
    assert os.path.exists(dir), 'Error directory does not exist!'
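
For reference, Evaluation.py takes a single existing directory as its command-line argument (sys.argv[1]); a hypothetical invocation of the updated evaluation script:

   python Evaluation.py /path/to/qpalma_runs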
scripts/compile_dataset.py
index aa6d12c..15bd3bd 100644
@@ -156,6 +156,8 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
 
          alternative_seq = process_remapped_reads(reReads,currentFRead,dna_flat_files)
 
+         #check_remapped_reads(reReads,currentFRead,dna_flat_files)
+
          if seq == '':
             continue
 
@@ -167,10 +169,6 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
          OriginalEsts.append(original_est)
          Qualities.append(qual)
          UpCut.append(up_cut)
-         #SplitPositions.append(spos)
-         #Ids.append(id)
-         #FReads.append(currentFRead)
-
          AlternativeSequences.append(alternative_seq)
 
          instance_counter += 1
@@ -591,6 +589,39 @@ def process_remapped_reads(reReads,fRead,dna_flat_files):
 
    return alternativeSeq
 
+def check_remapped_reads(reReads,fRead,dna_flat_files):
+   """
+   For all matches found by Vmatch we calculate the DNA fragment on which we
+   would like to perform an alignment during prediction.
+   """
+
+   fragment_size = 3000
+
+   seq = fRead['seq']
+   strand = fRead['strand']
+   pos = fRead['p_start']
+   chr = fRead['chr']
+
+   print 'fRead:\t%d %d %d %d' %\
+   (fRead['p_start'],fRead['exon_stop'],fRead['exon_start'],fRead['p_stop'])
+
+   for currentRRead in reReads:
+      rId            = currentRRead['id']
+
+      pos1           = currentRRead['pos1']
+      chr1           = currentRRead['chr1']
+      seq1           = currentRRead['seq1']
+
+      seq2           = currentRRead['seq2']
+      pos2           = 0
+
+      if seq2 != '':
+         pos2           = currentRRead['pos2']
+         chr2           = currentRRead['chr2']
+         s2_pos = seq.find(seq2)
+   
+      print 'reRead:\t %d %d %d %d'%(pos1,pos1+len(seq1),pos2,pos2+len(seq2))
+
 
 def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
    """
scripts/qpalma_main.py
index 40b7237..05ec085 100644
@@ -722,7 +722,16 @@ class QPalma:
                if elem != -inf:
                   acc_supp[idx] = 0.0
 
-         # the code up to here is equal for all alternative positions
+         current_example_predictions = []
+
+         # first make a prediction on the dna fragment which comes from the ground truth
+         current_prediction = self.calc_alignment(dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs)
+         current_prediction['exampleIdx'] = exampleIdx
+
+         current_example_predictions.append(current_prediction)
+
+         # then make predictions for all dna fragments that were found in the
+         # vmatch results
          for alternative_alignment in currentAlternatives:
             chr, strand, genomicSeq_start, genomicSeq_stop, currentLabel = alternative_alignment
             currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
@@ -731,10 +740,9 @@ class QPalma:
             current_prediction = self.calc_alignment(dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs)
             current_prediction['exampleIdx'] = exampleIdx
 
-         current_prediction = self.calc_alignment(dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs)
-         current_prediction['exampleIdx'] = exampleIdx
+            current_example_predictions.append(current_prediction)
 
-         allPredictions.append(current_prediction)
+         allPredictions.append(current_example_predictions)
 
       # end of the prediction loop: we save all predictions in a pickle file and exit
       cPickle.dump(allPredictions,open('%s_allPredictions_%s'%(run['name'],set_flag),'w+'))
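
For orientation, a minimal sketch of reading back the reshaped predictions pickle (the file name is hypothetical; the keys and the list-of-lists layout are taken from the diff above):

   import cPickle

   # one list of prediction dicts per example
   allPredictions = cPickle.load(open('myrun_allPredictions_TRAIN'))

   for example_preds in allPredictions:
      for pred in example_preds:
         print pred['exampleIdx'], pred['e1_b_off'], pred['DPScores'][0]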