+ fix changes we made so far
author    fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 22 Apr 2008 08:44:58 +0000 (08:44 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 22 Apr 2008 08:44:58 +0000 (08:44 +0000)
+ right after final ECCB submission

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8751 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/parsers.py
scripts/Evaluation.py
scripts/Experiment.py
scripts/ModelSelection.py
scripts/compile_dataset.py
scripts/createNewDataset.py
scripts/qpalma_main.py
tools/data_tools/filterReads.c

diff --git a/qpalma/parsers.py b/qpalma/parsers.py
index c84c136..4e9861a 100644
@@ -325,9 +325,9 @@ def parse_filtered_reads(filename):
 
       chr = int(chr)
 
-      prb = [ord(elem)-50 for elem in prb]
-      cal_prb = [ord(elem)-64 for elem in cal_prb]
-      chastity = [ord(elem)+10 for elem in chastity]
+      #prb = [ord(elem)-50 for elem in prb]
+      #cal_prb = [ord(elem)-64 for elem in cal_prb]
+      #chastity = [ord(elem)+10 for elem in chastity]
 
       p_start = int(p_start)
       exon_stop = int(exon_stop)
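The three lines disabled above decoded each quality string character by character with fixed ASCII offsets; after this change the raw strings are kept as-is. A minimal sketch of the disabled decoding, assuming the offsets from the removed lines; decode_quality and the input string are made up for illustration:

    # Sketch of the per-character decoding that this commit disables.
    # Offsets (-50, -64, +10) come from the removed lines; the input
    # string 'hhhS' is a hypothetical example.
    def decode_quality(s, offset):
        return [ord(c) + offset for c in s]

    prb      = decode_quality('hhhS', -50)  # per-base probabilities
    cal_prb  = decode_quality('hhhS', -64)  # calibrated probabilities
    chastity = decode_quality('hhhS', +10)  # chastity values
    print([prb, cal_prb, chastity])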
@@ -396,14 +396,13 @@ def parse_map_vm(filename):
       'mismatches':mismatches, 'length':length, 'offset':offset}
    
       if entries.has_key(id):
-         pdb.set_trace()
+         #pdb.set_trace()
          old_entry = entries[id]
          old_entry.append(line_d)
          entries[id] = old_entry
       else:
          entries[id] = [line_d]
 
-
    return entries
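parse_map_vm above grows a list of hit dicts per read id by hand. A sketch of the same grouping with collections.defaultdict, assuming each hit carries an 'id' key as the line_d built in this hunk does; group_hits_by_id is a hypothetical helper:

    from collections import defaultdict

    def group_hits_by_id(hits):
        # Group vmatch hits by read id; each value is a list of hit dicts.
        entries = defaultdict(list)
        for line_d in hits:
            entries[line_d['id']].append(line_d)
        return dict(entries)

    hits = [{'id': 7, 'mismatches': 1}, {'id': 7, 'mismatches': 0}]
    print(group_hits_by_id(hits))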
 
 
diff --git a/scripts/Evaluation.py b/scripts/Evaluation.py
index d0d9e7d..5e5c66b 100644
@@ -343,38 +343,119 @@ def forall_experiments(current_func,tl_dir):
 
 def predict_on(filename,filtered_reads):
 
+   coverage_map = {}
+
+   for line in open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/coverage_results/ALL_COVERAGES'):
+      id,coverage_nr = line.strip().split()
+      coverage_map[int(id)] = int(coverage_nr)
+
    print 'parsing filtered reads..'
    all_filtered_reads = parse_filtered_reads(filtered_reads)
    print 'found %d filtered reads' % len(all_filtered_reads)
 
    allPredictions = cPickle.load(open(filename))
 
+   spliced_ctr = 0
+   unspliced_ctr = 0
+
    pos_correct_ctr   = 0
    pos_incorrect_ctr = 0
 
-   score_correct_ctr = 0
-   score_incorrect_ctr = 0
+   correct_spliced_ctr     = 0
+   correct_unspliced_ctr   = 0
+
+   incorrect_spliced_ctr     = 0
+   incorrect_unspliced_ctr   = 0
+
+   correct_covered_splice_ctr = 0
+   incorrect_covered_splice_ctr = 0
 
    total_vmatch_instances_ctr = 0
 
+   unspliced_spliced_reads_ctr = 0
+   wrong_spliced_reads_ctr = 0
+
+   wrong_aligned_unspliced_reads_ctr = 0
+   wrong_unspliced_reads_ctr = 0
+
    cut_pos_ctr = {}
 
+   total_ctr = 0
+   skipped_ctr = 0
+
+   is_spliced = False
+   min_coverage = 3
+
+   allUniqPredictions = {}
+
+   print 'Got %d predictions' % len(allPredictions)
+
+   for new_prediction in allPredictions:
+      id = new_prediction['id']
+      id = int(id)
 
-   for current_prediction in allPredictions:
+      if allUniqPredictions.has_key(id):
+         current_prediction = allUniqPredictions[id]
+
+         current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
+         new_score =  new_prediction['DPScores'].flatten().tolist()[0][0]
+
+         if current_a_score < new_score :
+            allUniqPredictions[id] = new_prediction
+
+      else:
+         allUniqPredictions[id] = new_prediction
+
+   print 'Got %d uniq predictions' % len(allUniqPredictions)
+
+   #for current_prediction in allPredictions:
+   for _id,current_prediction in allUniqPredictions.items():
       id = current_prediction['id']
-      current_ground_truth = all_filtered_reads[id]
+      id = int(id)
+
+      if id < 1000000010006:
+         continue
+
+      if not id >= 1000000300000:
+         is_spliced = True
+      else:
+         is_spliced = False
+
+      is_covered = False
+
+      if is_spliced:
+         try:
+            current_coverage_nr = coverage_map[id]
+            is_covered = True
+         except:
+            is_covered = False
+
+
+      if is_spliced:
+         spliced_ctr += 1
+      else: 
+         unspliced_ctr += 1
+   
+      try:
+         current_ground_truth = all_filtered_reads[id]
+      except:
+         skipped_ctr += 1
+         continue
 
       start_pos = current_prediction['start_pos']
       chr = current_prediction['chr']
       strand = current_prediction['strand']
 
       #score = current_prediction['DPScores'].flatten().tolist()[0][0]
-
       #pdb.set_trace()
 
       predExons = current_prediction['predExons'] #:newExons, 'dna':dna, 'est':est
       predExons = [e+start_pos for e in predExons]
+
+      spliced_flag = False
+
       if len(predExons) == 4:
+         spliced_flag = True
          predExons[1] -= 1
          predExons[3] -= 1
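The loop over allPredictions above keeps, for each read id, only the prediction whose first DPScores entry is largest. A condensed sketch of that deduplication, with the DPScores matrix lookup replaced by a plain 'score' float so the example stays self-contained; best_prediction_per_id is illustrative only:

    def best_prediction_per_id(predictions):
        # Keep only the highest-scoring prediction for each read id.
        best = {}
        for pred in predictions:
            read_id = int(pred['id'])
            if read_id not in best or best[read_id]['score'] < pred['score']:
                best[read_id] = pred
        return best

    preds = [{'id': '42', 'score': 1.5}, {'id': '42', 'score': 2.0}]
    assert best_prediction_per_id(preds)[42]['score'] == 2.0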
 
@@ -388,17 +469,12 @@ def predict_on(filename,filtered_reads):
 
          if p_start == predExons[0] and e_stop == predExons[1] and\
          e_start == predExons[2] and p_stop == predExons[3]:
-            pos_correct_ctr += 1
+            pos_correct = True
          else:
-            pos_incorrect_ctr += 1
-            #pdb.set_trace()
-            try:
-               cut_pos_ctr[cut_pos] += 1
-            except:
-               cut_pos_ctr[cut_pos] = 1
-
+            pos_correct = False
 
       elif len(predExons) == 2:
+         spliced_flag = False
          predExons[1] -= 1
 
          cut_pos = current_ground_truth['true_cut']
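In the four-exon branch of the hunk above, a prediction counts as positionally correct only when all four predicted boundaries equal the ground truth. A sketch of that comparison on plain integers; spliced_pos_correct is a made-up name and the boundary order follows the hunk:

    def spliced_pos_correct(pred_exons, p_start, e_stop, e_start, p_stop):
        # All four boundaries of a spliced alignment must match exactly.
        return (pred_exons[0] == p_start and pred_exons[1] == e_stop and
                pred_exons[2] == e_start and pred_exons[3] == p_stop)

    print(spliced_pos_correct([10, 40, 90, 120], 10, 40, 90, 120))  # True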
@@ -407,36 +483,85 @@ def predict_on(filename,filtered_reads):
 
          true_cut = current_ground_truth['true_cut']
 
-         if p_start == predExons[0] and p_stop == predExons[1]:
-            pos_correct_ctr += 1
+         #pdb.set_trace()
+
+         if math.fabs(p_start - predExons[0]) <= 0:# and math.fabs(p_stop - predExons[1]) <= 2:
+            pos_correct = True
          else:
-            pos_incorrect_ctr += 1
-            #pdb.set_trace()
+            pos_correct = False
+
+      else:
+         pos_correct = False
+
+      if is_spliced and not spliced_flag:
+         unspliced_spliced_reads_ctr += 1
+
+      if is_spliced and not pos_correct and len(predExons) == 4 and predExons[1]!=-1:
+          wrong_spliced_reads_ctr += 1
+
+      if not is_spliced and spliced_flag:
+         wrong_unspliced_reads_ctr += 1
+
+      if not is_spliced and not pos_correct:
+         wrong_aligned_unspliced_reads_ctr += 1
+
+
+      if pos_correct:
+         pos_correct_ctr += 1
+
+         if is_spliced:
+               correct_spliced_ctr += 1
+               if is_covered and current_coverage_nr >= min_coverage:
+                  correct_covered_splice_ctr += 1 
+
+         if not is_spliced:
+               correct_unspliced_ctr += 1
 
       else:
-         pass
-      ## check whether the correct predictions score higher than the incorrect
-      ## ones
-      #cmp_res = compare_scores_and_labels(current_scores,current_labels)
-      #if cmp_res:
-      #   score_correct_ctr += 1
-      #else:
-      #   score_incorrect_ctr += 1
+         pos_incorrect_ctr += 1
+
+         if is_spliced:
+               incorrect_spliced_ctr += 1
+               if is_covered and current_coverage_nr >= min_coverage:
+                  incorrect_covered_splice_ctr += 1 
+
+         if not is_spliced:
+               incorrect_unspliced_ctr += 1
 
-   numPredictions = len(allPredictions)
+      if spliced_flag:
+          if not is_covered:
+              current_coverage_nr=0 
+          if pos_correct:
+              print "%s\tcorrect\t%i" %( current_prediction['id'], current_coverage_nr)
+          else:
+              print "%s\twrong\t%i" %( current_prediction['id'], current_coverage_nr)
+
+      total_ctr += 1
+
+
+   numPredictions = len(allUniqPredictions)
 
    # now that we have evaluated all instances put out all counters and sizes
    print 'Total num. of examples: %d' % numPredictions
-   print 'Correct pos: %2.3f, incorrect pos: %2.3f' %\
-   (pos_correct_ctr/(1.0*numPredictions),pos_incorrect_ctr/(1.0*numPredictions))
-   print cut_pos_ctr
 
-   #print 'Correct scores: %d, incorrect scores: %d' %\
-   #(score_correct_ctr,score_incorrect_ctr)
+   print "spliced/unspliced:  %d,%d " % (spliced_ctr, unspliced_ctr )
+   print "Correct/incorrect spliced: %d,%d " % (correct_spliced_ctr, incorrect_spliced_ctr )
+   print "Correct/incorrect unspliced: %d,%d " % (correct_unspliced_ctr , incorrect_unspliced_ctr )
+   print "Correct/incorrect covered spliced read: %d,%d " %\
+   (correct_covered_splice_ctr,incorrect_covered_splice_ctr)
+
+   print "pos_correct: %d,%d" % (pos_correct_ctr ,  pos_incorrect_ctr )
+
+   print 'unspliced_spliced reads: %d' % unspliced_spliced_reads_ctr 
+   print 'spliced reads at wrong_place: %d' % wrong_spliced_reads_ctr
+
+   print 'spliced_unspliced reads:  %d' % wrong_unspliced_reads_ctr
+   print 'wrong aligned at wrong_pos: %d' % wrong_aligned_unspliced_reads_ctr
 
-   #pos_error   = 1.0 * pos_incorrect_ctr / true_vmatch_instances_ctr
-   #score_error = 1.0 * score_incorrect_ctr / total_vmatch_instances_ctr
+   print 'total_ctr: %d' % total_ctr
 
+   print "skipped: %d "  % skipped_ctr
+   print 'min. coverage: %d' % min_coverage
 
 if __name__ == '__main__':
    #dir = sys.argv[1]
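The evaluation above classifies reads as spliced or unspliced purely from the numeric read id and trusts spliced calls only above a coverage threshold. A sketch of that bookkeeping, assuming the id boundary (1000000300000) and min_coverage value (3) used in the hunks; classify_read is illustrative:

    SPLICED_ID_LIMIT = 1000000300000  # ids below this are spliced reads
    MIN_COVERAGE = 3

    def classify_read(read_id, coverage_map):
        # Return (is_spliced, is_covered) for one read id.
        is_spliced = read_id < SPLICED_ID_LIMIT
        is_covered = coverage_map.get(read_id, 0) >= MIN_COVERAGE
        return is_spliced, is_covered

    print(classify_read(1000000100000, {1000000100000: 5}))  # (True, True)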
diff --git a/scripts/Experiment.py b/scripts/Experiment.py
index 4d2d4ad..dd22238 100644
@@ -16,6 +16,18 @@ import pdb
 import os
 import os.path
 
+def get_dataset(dataset_size,num_nodes):
+   all_instances = []
+
+   params = [\
+   ('prediction_begin',400000),\
+   ('prediction_end',440000)]
+
+   all_instances.append(params)
+
+   return all_instances
+
+
 def get_dataset_splitting_instances(dataset_size,num_nodes):
    all_instances = []
 
@@ -73,15 +85,17 @@ def createRuns():
 
    allRuns = []
 
-   dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_2nd'
+   dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_full'
 
-   dataset_size = 400000
+   dataset_size = 500000
    num_nodes = 10
 
    ctr = 1
 
    #all_instances = get_scoring_instances()
    all_instances = get_dataset_splitting_instances(dataset_size,num_nodes)
+   #all_instances = get_dataset(dataset_size,num_nodes)
+   #pdb.set_trace()
             
    for parameters in all_instances:
       # create a new Run object
@@ -91,8 +105,8 @@ def createRuns():
       for param_name,param in parameters:
          currentRun[param_name] = param
          currentName += '%s_%s_' % (str(param_name),str(param))
-         print param_name,param
-         print currentName
+         #print param_name,param
+         #print currentName
             
       # global settings for all runs
       currentRun['anzpath']            = Conf.anzpath
@@ -113,7 +127,7 @@ def createRuns():
       currentRun['numQualSuppPoints']     = 10
       currentRun['totalQualSuppPoints']   = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']
 
-      currentRun['enable_quality_scores'] = True
+      currentRun['enable_quality_scores'] = False
       currentRun['enable_splice_signals'] = True
       currentRun['enable_intron_length']  = True
 
diff --git a/scripts/ModelSelection.py b/scripts/ModelSelection.py
index 28b59f6..81941f4 100644
@@ -5,6 +5,7 @@ import cPickle
 import os
 import os.path
 import time
+import pdb
 import subprocess
 
 from qpalma_main import QPalma
@@ -21,7 +22,9 @@ class Model:
 
       allRuns = Exp.createRuns()
       
-      for currentRun in allRuns[0:1]:
+      pdb.set_trace()
+      
+      for currentRun in allRuns[9:]:
                
          currentInstance = QPalma(currentRun)
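The loop above builds one QPalma instance per run and now starts at index 9 of the run list instead of taking only the first run. A sketch of the dispatch pattern, with QPalma replaced by a stand-in Trainer class:

    class Trainer(object):
        # Stand-in for the QPalma instance built per run.
        def __init__(self, run):
            self.run = run
        def train(self):
            print('training %s' % self.run['name'])

    def dispatch(all_runs, start=0):
        for current_run in all_runs[start:]:
            Trainer(current_run).train()

    dispatch([{'name': 'run_a'}, {'name': 'run_b'}], start=1)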
 
diff --git a/scripts/compile_dataset.py b/scripts/compile_dataset.py
index 46e5434..839c91f 100644
@@ -78,14 +78,16 @@ def compile_d2(gff_file,dna_flat_files,filtered_reads,remapped_reads,dataset_fil
    for i in range(1,6):
       allGenes[i]= parse_gff(gff_file%i) 
 
-   print 'parsing filtered reads..'
-   all_filtered_reads = parse_filtered_reads(filtered_reads)
-   print 'found %d filtered reads' % len(all_filtered_reads)
-
    print 'parsing map reads'
    all_remapped_reads = parse_map_vm(remapped_reads)
    print 'found %d remapped reads' % len(all_remapped_reads)
 
+   #pdb.set_trace()
+
+   print 'parsing filtered reads..'
+   all_filtered_reads = parse_filtered_reads(filtered_reads)
+   print 'found %d filtered reads' % len(all_filtered_reads)
+
    # this stores the new dataset
    SeqInfo = []
    OriginalEsts = []
@@ -95,6 +97,7 @@ def compile_d2(gff_file,dna_flat_files,filtered_reads,remapped_reads,dataset_fil
    # training / prediction example
    instance_counter = 0
    skipped_ctr = 0
+   num_missed_reads  = 0
 
    for reId,allreReads in all_remapped_reads.items():
 
@@ -102,8 +105,11 @@ def compile_d2(gff_file,dna_flat_files,filtered_reads,remapped_reads,dataset_fil
             currentFRead = all_filtered_reads[reId]
          except:
             skipped_ctr +=1 
+            num_missed_reads += len(allreReads) 
             continue
 
+         #pdb.set_trace()
+
          for reRead in allreReads:
                
             if instance_counter % 1000 == 0:
@@ -128,9 +134,9 @@ def compile_d2(gff_file,dna_flat_files,filtered_reads,remapped_reads,dataset_fil
 
             instance_counter += 1
 
-
    print 'Full dataset has size %d' % len(SeqInfo)
    print 'Skipped %d reads' % skipped_ctr
+   print 'Num. missed reads %d reads' % num_missed_reads
 
    dataset = [SeqInfo, OriginalEsts, Qualities ]
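Besides counting skipped ids, compile_d2 now also counts how many individual remapped reads are lost when an id has no filtered counterpart. The same bookkeeping as a small helper, with the bare except replaced by an explicit membership test; count_missing is illustrative:

    def count_missing(all_remapped_reads, all_filtered_reads):
        # (ids without a filtered read, total remapped reads lost with them)
        skipped_ctr = 0
        num_missed_reads = 0
        for re_id, all_re_reads in all_remapped_reads.items():
            if re_id not in all_filtered_reads:
                skipped_ctr += 1
                num_missed_reads += len(all_re_reads)
        return skipped_ctr, num_missed_reads

    print(count_missing({1: ['r1', 'r2'], 2: ['r3']}, {2: 'fr'}))  # (1, 2)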
 
diff --git a/scripts/createNewDataset.py b/scripts/createNewDataset.py
index 87aa533..15608de 100644
@@ -7,8 +7,10 @@ from compile_dataset import compile_d2
 #filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.full'
 #remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/map.vm_cut'
 
-filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/allReads.full.newid'
 #remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/spliced.result_no_comm'
+
+
+filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/allReads.full.newid'
 remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/map_2nd.vm_chr'
 
 compile_d2(Conf.gff_fn,Conf.dna_flat_fn,filtered_fn,remapped_fn,'dataset_2nd',True)
diff --git a/scripts/qpalma_main.py b/scripts/qpalma_main.py
index 23b66af..8b52328 100644
@@ -728,6 +728,8 @@ class QPalma:
 
       totalQualityPenalties = zeros((totalQualSP,1))
 
+      problem_ctr = 0
+
       # where we store the predictions
       allPredictions = []
 
@@ -767,7 +769,11 @@ class QPalma:
          if not chr in range(1,6):
             continue
 
-         currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+         try:
+            currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+         except:
+            problem_ctr += 1
+            continue
 
          if not run['enable_splice_signals']:
             for idx,elem in enumerate(currentDon):
@@ -792,6 +798,7 @@ class QPalma:
       # end of the prediction loop we save all predictions in a pickle file and exit
       cPickle.dump(allPredictions,open('%s_allPredictions_%s'%(run['name'],set_flag),'w+'))
       print 'Prediction completed'
+      print 'Problem ctr %d' % problem_ctr
 
 
    def calc_alignment(self, dna, est, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
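The prediction loop above now tolerates failures of get_seq_and_scores by counting them and moving on instead of aborting. A sketch of that guard as a reusable helper; fetch_or_count and fetch_fn are made-up names, and Exception is caught instead of everything:

    def fetch_or_count(fetch_fn, args, problem_ctr):
        # Call fetch_fn(*args); on failure return None and bump the counter.
        try:
            return fetch_fn(*args), problem_ctr
        except Exception:
            return None, problem_ctr + 1

    result, problems = fetch_or_count(lambda c: 1.0 / c, (0,), 0)
    print('%s %s' % (result, problems))  # None 1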
diff --git a/tools/data_tools/filterReads.c b/tools/data_tools/filterReads.c
index b9300d2..c0431aa 100644
@@ -218,8 +218,8 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          if ( prev_exon_start <= read_start && read_stop <= prev_exon_stop ) { // read is inside previous exon
 
             // output of unused i.e. unspliced reads
-            fprintf(unfiltered_out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
-            unfiltered_read_nr,chr,strand,seq,0,read_size,prb,cal_prb,chastity,gene_id,read_start,-1,-1,read_stop);
+            fprintf(unfiltered_out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
+            unfiltered_read_nr,chr,strand,seq,0,read_size,prb,cal_prb,chastity,gene_id,read_start,-1,-1,read_stop,-1);
             unfiltered_read_nr++;
             
             exonicReadCtr++;
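The fprintf change above appends a fifteenth tab-separated column (a constant -1) to every unfiltered-read record. A sketch of a reader for the widened layout; the field order follows the format string, and both parse_unfiltered_line and the sample line are made up:

    def parse_unfiltered_line(line):
        # Split one unfiltered-read record into its 15 tab-separated fields.
        fields = line.rstrip('\n').split('\t')
        assert len(fields) == 15, 'expected the new 15-column layout'
        return {'read_nr': int(fields[0]), 'chr': int(fields[1]),
                'strand': fields[2], 'seq': fields[3],
                'read_start': int(fields[10]), 'read_stop': int(fields[13]),
                'extra': int(fields[14])}

    line = '1\t3\t+\tACGT\t0\t4\thhhh\thhhh\thhhh\tg1\t100\t-1\t-1\t103\t-1\n'
    print(parse_unfiltered_line(line))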