total_vmatch_instances_ctr = 0
+ cut_pos_ctr = {}
+
for current_prediction in allPredictions:
id = current_prediction['id']
else:
pos_incorrect_ctr += 1
#pdb.set_trace()
+ try:
+ cut_pos_ctr[cut_pos] += 1
+ except:
+ cut_pos_ctr[cut_pos] = 1
+
elif len(predExons) == 2:
predExons[1] -= 1
print 'Total num. of examples: %d' % numPredictions
print 'Correct pos: %2.3f, incorrect pos: %2.3f' %\
(pos_correct_ctr/(1.0*numPredictions),pos_incorrect_ctr/(1.0*numPredictions))
+ print cut_pos_ctr
#print 'Correct scores: %d, incorrect scores: %d' %\
#(score_correct_ctr,score_incorrect_ctr)
import os
import os.path
def get_dataset_splitting_instances(dataset_size,num_nodes):
    """Partition [0, dataset_size) into num_nodes contiguous slices.

    Returns a list with one parameter list per node; each parameter list
    holds the pairs ('prediction_begin', begin) and ('prediction_end', end).
    The last node absorbs the remainder when dataset_size is not evenly
    divisible by num_nodes.
    """
    all_instances = []

    # '//' keeps the per-node slice size integral (plain '/' yields a float
    # on Python 3, which would corrupt the begin/end indices)
    part = dataset_size // num_nodes
    begin = 0
    end = 0
    for idx in range(1,num_nodes+1):
        if idx == num_nodes:
            # final slice: stretch to the very end so no example is dropped
            begin = end
            end = dataset_size
        else:
            begin = end
            end = begin+part

        params = [
            ('prediction_begin',begin),
            ('prediction_end',end)]

        all_instances.append(params)

    return all_instances
+
+
def get_scoring_instances():
    """Enumerate all 2**3 feature-flag combinations for scoring runs.

    Returns a list of parameter lists, one per combination of the
    quality-score, splice-signal and intron-length feature switches,
    with True before False for each flag (outermost flag varies slowest).
    """
    all_instances = []

    for QFlag in [True,False]:
        for SSFlag in [True,False]:
            for ILFlag in [True,False]:
                # backslash continuations are unnecessary inside brackets
                params = [
                    ('enable_quality_scores',QFlag),
                    ('enable_splice_signals',SSFlag),
                    ('enable_intron_length',ILFlag)]
                all_instances.append(params)

    return all_instances
+
+
def createRuns():
# Build and sanity-check the list of Run objects for one experiment sweep.
# NOTE: this span is a merged diff hunk -- '-' lines are the old grid search
# over (QFlag,SSFlag,ILFlag); '+' lines are the new per-node dataset split
# driven by get_dataset_splitting_instances(). Only '+'/context lines are live.
# specify n for n-fold cross validation
numFolds=5
# the main directory where all results are stored
- experiment_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalmaTraining'
+ experiment_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_single_run'
assert os.path.exists(experiment_dir), 'toplevel dir for experiment does not exist!'
allRuns = []
- dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_test_new'
+ dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_2nd'
- for QFlag in [True,False]:
- for SSFlag in [True,False]:
- for ILFlag in [True,False]:
# the new driver: split 400k examples across 10 compute nodes
+ dataset_size = 400000
+ num_nodes = 10
+
+ ctr = 1
+
+ #all_instances = get_scoring_instances()
+ all_instances = get_dataset_splitting_instances(dataset_size,num_nodes)
- # create a new Run object
- currentRun = Run()
+ for parameters in all_instances:
+ # create a new Run object
+ currentRun = Run()
+ currentName = 'run_'
+
# each (param_name,param) pair both configures the run and extends its name
+ for param_name,param in parameters:
+ currentRun[param_name] = param
+ currentName += '%s_%s_' % (str(param_name),str(param))
+ print param_name,param
+ print currentName
- # global settings for all runs
- currentRun['anzpath'] = Conf.anzpath
- currentRun['iter_steps'] = Conf.iter_steps
- currentRun['matchmatrixRows'] = Conf.sizeMatchmatrix[0]
- currentRun['matchmatrixCols'] = Conf.sizeMatchmatrix[1]
- currentRun['mode'] = Conf.mode
- currentRun['numConstraintsPerRound'] = Conf.numConstraintsPerRound
+ # global settings for all runs
+ currentRun['anzpath'] = Conf.anzpath
+ currentRun['iter_steps'] = Conf.iter_steps
+ currentRun['matchmatrixRows'] = Conf.sizeMatchmatrix[0]
+ currentRun['matchmatrixCols'] = Conf.sizeMatchmatrix[1]
+ currentRun['mode'] = Conf.mode
+ currentRun['numConstraintsPerRound'] = Conf.numConstraintsPerRound
- currentRun['remove_duplicate_scores'] = Conf.remove_duplicate_scores
- currentRun['print_matrix'] = Conf.print_matrix
- currentRun['read_size'] = Conf.read_size
+ currentRun['remove_duplicate_scores'] = Conf.remove_duplicate_scores
+ currentRun['print_matrix'] = Conf.print_matrix
+ currentRun['read_size'] = Conf.read_size
-
- currentRun['numLengthSuppPoints'] = 10 #Conf.numLengthSuppPoints
+ currentRun['numDonSuppPoints'] = 10
+ currentRun['numAccSuppPoints'] = 10
- # if we are not using an intron length model at all we do not need the support points
- if ILFlag == False:
- currentRun['numLengthSuppPoints'] = 2 #Conf.numLengthSuppPoints
+ currentRun['numQualPlifs'] = Conf.numQualPlifs
+ currentRun['numQualSuppPoints'] = 10
+ currentRun['totalQualSuppPoints'] = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']
# all three feature classes are now hard-enabled (previously grid-searched)
+ currentRun['enable_quality_scores'] = True
+ currentRun['enable_splice_signals'] = True
+ currentRun['enable_intron_length'] = True
- currentRun['numQualPlifs'] = Conf.numQualPlifs
- currentRun['numQualSuppPoints'] = 10
- currentRun['totalQualSuppPoints'] = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']
+ # if we are not using an intron length model at all we do not need the support points
+ currentRun['numLengthSuppPoints'] = 10 #Conf.numLengthSuppPoints
- currentRun['numFeatures'] = currentRun['numLengthSuppPoints']\
- + currentRun['numDonSuppPoints'] + currentRun['numAccSuppPoints']\
- + currentRun['matchmatrixRows'] * currentRun['matchmatrixCols']\
- + currentRun['totalQualSuppPoints']
+ if currentRun['enable_intron_length'] == False:
+ currentRun['numLengthSuppPoints'] = 2 #Conf.numLengthSuppPoints
- # run-specific settings
- currentRun['training_begin'] = Conf.training_begin
- currentRun['training_end'] = Conf.training_end
- currentRun['prediction_begin'] = Conf.prediction_begin
- currentRun['prediction_end'] = Conf.prediction_end
+ currentRun['numFeatures'] = currentRun['numLengthSuppPoints']\
+ + currentRun['numDonSuppPoints'] + currentRun['numAccSuppPoints']\
+ + currentRun['matchmatrixRows'] * currentRun['matchmatrixCols']\
+ + currentRun['totalQualSuppPoints']
- currentRun['enable_quality_scores'] = QFlag
- currentRun['enable_splice_signals'] = SSFlag
- currentRun['enable_intron_length'] = ILFlag
# prediction_begin/end now arrive via the splitting parameters above,
# so the Conf-based assignments were commented out
+ # run-specific settings
+ currentRun['training_begin'] = Conf.training_begin
+ currentRun['training_end'] = Conf.training_end
+ #currentRun['prediction_begin'] = Conf.prediction_begin
+ #currentRun['prediction_end'] = Conf.prediction_end
- currentName = 'run_%s_quality_%s_splicesignals_%s_intron_len' %\
- (bool2str[QFlag],bool2str[SSFlag],bool2str[ILFlag])
+ currentRun['C'] = 100
- currentRun['C'] = 100
+ currentRun['name'] = currentName
+ currentRun['dataset_filename'] = dataset_filename
+ currentRun['experiment_path'] = experiment_dir
- currentRun['name'] = currentName
- currentRun['dataset_filename'] = dataset_filename
- currentRun['experiment_path'] = experiment_dir
+ currentRun['min_intron_len'] = 20
+ currentRun['max_intron_len'] = 2000
- currentRun['min_intron_len'] = 20
- currentRun['max_intron_len'] = 2000
+ currentRun['min_svm_score'] = 0.0
+ currentRun['max_svm_score'] = 1.0
- #currentRun['min_intron_len'] = 10
- #currentRun['max_intron_len'] = 100
+ currentRun['min_qual'] = -5
+ currentRun['max_qual'] = 40
- currentRun['min_svm_score'] = 0.0
- currentRun['max_svm_score'] = 1.0
+ currentRun['dna_flat_files'] = Conf.dna_flat_fn
- currentRun['min_qual'] = -5
- currentRun['max_qual'] = 40
+ currentRun['id'] = ctr
+ ctr += 1
- currentRun['dna_flat_files'] = Conf.dna_flat_fn
+ allRuns.append(currentRun)
+
+
+###############################################################################
+#
+# check for valid paths / options etc
+#
+###############################################################################
- allRuns.append(currentRun)
- #
- # check for valid paths / options etc
- #
for currentRun in allRuns:
assert 0 < currentRun['anzpath'] < 100
assert 0 <= currentRun['training_begin'] < currentRun['training_end']
- assert currentRun['training_end'] <= currentRun['prediction_begin'] < currentRun['prediction_end']
+ assert currentRun['training_begin'] < currentRun['training_end']
+ assert currentRun['prediction_begin'] < currentRun['prediction_end']
assert currentRun['iter_steps']
# NOTE(review): no 'return allRuns' is visible in this hunk although callers
# do 'allRuns = createRuns()' -- presumably in elided diff context; confirm.
if __name__ == '__main__':
# Entry point for the experiment script; the '+' line disables the
# interactive debugger break that used to follow run creation.
allRuns = createRuns()
- pdb.set_trace()
+ #pdb.set_trace()
from genome_utils import load_genomic
# only for checking purposes
-from compile_dataset import get_seq_and_scores
+#from compile_dataset import get_seq_and_scores
class Lookup:
self.max_size = 40000000
self.chromosomes = range(1,6)
- #self.chromosomes = [1]
-
- strands = ['+','-']
+ self.strands = ['+']
self.genomicSequencesP = [0]*(len(self.chromosomes)+1)
self.acceptorScoresP = [0]*(len(self.chromosomes)+1)
self.donorScoresN = [0]*(len(self.chromosomes)+1)
for chrId in self.chromosomes:
- for strandId in strands:
+ for strandId in self.strands:
print (chrId,strandId)
self.prefetch_seq_and_scores(chrId,strandId)
def get_seq_and_scores(self,chr,strand,start,stop,dna_flat_files):
# Return (sequence, acceptor scores, donor scores) for the 1-based
# interval [start, stop] on the given chromosome/strand, served from the
# prefetched per-chromosome caches.
# The '+' lines of this hunk fix the missing ':' after the if, the bare
# 'elif:', and the typo'd currentAcceptorScoresP/N names, add a strand
# sanity assert, and lowercase the returned sequence.
assert chr in self.chromosomes
+ assert strand in self.strands
- if strand == '+'
+ if strand == '+':
currentSequence = self.genomicSequencesP[chr][start-1:stop]
length = len(currentSequence)
- currentDonorScores = self.donorScoresP[chr][start:stop]
- currentDonorScores += [-inf]*(length-len(currentDonorScores))
- currentAcceptorScores = self.acceptorScoresP[chr][start:stop+2]
- currentAcceptorScores = currentAcceptorScoresP[1:]
- elif:
# pad donor scores with -inf up to the sequence length; 'inf' must be
# defined at module level (not visible in this chunk) -- confirm
+ currentDonorScores = self.donorScoresP[chr][start:stop]
+ currentDonorScores += [-inf]*(length-len(currentDonorScores))
+ currentAcceptorScores = self.acceptorScoresP[chr][start:stop+2]
+ currentAcceptorScores = currentAcceptorScores[1:]
+ elif strand == '-':
currentSequence = self.genomicSequencesN[chr][start-1:stop]
length = len(currentSequence)
- currentDonorScores = self.donorScoresN[chr][start:stop]
- currentDonorScores += [-inf]*(length-len(currentDonorScores))
- currentAcceptorScores = self.acceptorScoresN[chr][start:stop+2]
- currentAcceptorScores = currentAcceptorScoresN[1:]
+ currentDonorScores = self.donorScoresN[chr][start:stop]
+ currentDonorScores += [-inf]*(length-len(currentDonorScores))
+ currentAcceptorScores = self.acceptorScoresN[chr][start:stop+2]
+ currentAcceptorScores = currentAcceptorScores[1:]
+ else:
# unreachable given the assert above; debugger trap kept as a guard
+ pdb.set_trace()
- return currentSequence, currentAcceptorScores, currentDonorScores
+ return currentSequence.lower(), currentAcceptorScores, currentDonorScores
def prefetch_seq_and_scores(self,chr,strand):
# Fill the per-chromosome caches (sequence, acceptor and donor scores)
# for one chromosome/strand so later get_seq_and_scores() calls are
# served from memory.
# NOTE(review): intervalBegin/intervalEnd and genomicSeq are not defined
# in the visible lines -- this diff hunk has elided context; confirm they
# are computed just above in the full method.
currentAcc, currentDon = self.getSpliceScores(chr,strand,intervalBegin,intervalEnd)
- if strand == '+'
+ if strand == '+':
self.genomicSequencesP[chr] = genomicSeq
self.acceptorScoresP[chr] = currentAcc
self.donorScoresP[chr] = currentDon
- elif:
+ elif strand == '-':
self.genomicSequencesN[chr] = genomicSeq
self.acceptorScoresN[chr] = currentAcc
self.donorScoresN[chr] = currentDon
import os
import os.path
import time
+import subprocess
from qpalma_main import QPalma
import Experiment as Exp
allRuns = Exp.createRuns()
- for currentRun in allRuns:
+ for currentRun in allRuns[0:1]:
currentInstance = QPalma(currentRun)
self.allInstances.append(instance_fn)
+
def doSelection(self):
# Iterate over the pickled run instances, throttled by a 2s delay per
# instance; the actual job launch (resurrect via os.system) is currently
# commented out, so this is a dry-run loop.
for instance in self.allInstances:
time.sleep(2)
#os.system('echo "./resurrect %s >out_%s.log 2>err_%s.log &"'%(instance,instance,instance))
+ def doPrediction(self):
+ # Launch one background doPrediction.sh job per pickled run instance,
+ # throttled by a 2s delay; stdout/stderr are redirected to per-instance
+ # log files by the shell command itself.
+ # NOTE(review): current_dir is assigned but not used in the visible
+ # lines -- possibly leftover; confirm.
+ current_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_single_run'
+
+ for instance in self.allInstances:
+ time.sleep(2)
+ # SECURITY: instance is interpolated into a shell=True command line;
+ # safe only because instance paths are generated locally, not untrusted.
+ cmd = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s 1>_%s.log 2>%s.log & '%(instance,instance,instance)
+ #cmd = 'echo /fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s |\
+ #qsub -l h_vmem=12.0G -cwd -j y -N \"%s.log\"'%(instance,instance)
+ #os.system(cmd)
+ # fire-and-forget: communicate() is commented out, so the parent does
+ # not wait for or read the child's output
+ obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+ #out,err = obj.communicate()
+ #print out,err
+
+
+
if __name__ == '__main__':
# Entry point: the '+' lines switch the pipeline from job selection
# (doSelection) to launching predictions (doPrediction).
m = Model()
m.createInstances()
- m.doSelection()
+ #m.doSelection()
+ m.doPrediction()
dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
start = cpu()
- self.all_remapped_reads = parse(data_fname)
+ self.all_remapped_reads = parse_map_vm_heuristic(data_fname)
stop = cpu()
print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
currentVMatchAlignment = dna, exons, est, original_est, quality,\
currentAcc, currentDon
- alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+ try:
+ alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+ except:
+ alternativeAlignmentScores = []
+
if alternativeAlignmentScores == []:
# no alignment necessary
stop = cpu()
self.get_time += stop-start
dna = currentDNASeq
-
+
proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
alternativeScores = []
if instance_counter % 1000 == 0:
print 'processed %d examples' % instance_counter
- #if instance_counter == 400000:
- # break
-
if reRead['strand'] != '+':
continue
-
- #seq_info, exons, cut_pos =\
- # process_filtered_read(currentFRead,dna_flat_files)
(id,chr,strand,genomicSeq_start,genomicSeq_stop) =\
process_map(reRead,currentFRead,dna_flat_files)
import cPickle
import sys
-import pydb
import pdb
import os
import os.path
import math
+from qpalma.parsers import *
# Convert pickled QPalma predictions into one tab-separated alignment line
# per read, joined with the ground-truth filtered reads by example id.
# The '+' side of this hunk replaces the old in-memory collection (keyed
# allPositions dict over nested example/alternative predictions) with direct
# streaming output to out_fname.
-def prediction_on(filename):
+def prediction_on(filename,filtered_reads,out_fname):
- allPredictions = cPickle.load(open(filename))
+ print 'parsing filtered reads..'
+ all_filtered_reads = parse_filtered_reads(filtered_reads)
+ print 'found %d filtered reads' % len(all_filtered_reads)
+
# NOTE(review): out_fh is never closed in this hunk -- relies on process
# exit to flush; confirm acceptable for this script.
+ out_fh = open(out_fname,'w+')
+ allPredictions = cPickle.load(open(filename))
allPositions = {}
# 'id' shadows the builtin; kept as-is because it matches the rest of the file
+ for current_prediction in allPredictions:
+ id = current_prediction['id']
+ current_ground_truth = all_filtered_reads[id]
- for current_example_pred in allPredictions:
- gt_example = current_example_pred[0]
+ true_cut = current_ground_truth['true_cut']
+ seq = current_ground_truth['seq']
+ q1 = current_ground_truth['prb']
- exampleId = gt_example['id']
+ chr = current_prediction['chr']
+ strand = current_prediction['strand']
+ #true_cut = current_prediction['true_cut']
+ start_pos = current_prediction['start_pos']
+ alignment = current_prediction['alignment']
- for elem_nr,current_pred in enumerate(current_example_pred[1:]):
- exampleId = current_pred['id']
- chr = current_pred['chr']
- strand = current_pred['strand']
- true_cut = current_pred['true_cut']
- start_pos = current_pred['alternative_start_pos']
- alignment = current_pred['alignment']
+ predExons = current_prediction['predExons']
- predExons = current_pred['predExons']
- trueExons = current_pred['trueExons']
- predPositions = [elem + current_pred['alternative_start_pos'] for elem in predExons]
- truePositions = [elem + current_pred['start_pos'] for elem in trueExons.flatten().tolist()[0]]
# 4 boundaries = spliced prediction; 2 = unspliced (inner pair set to -1)
+ if len(predExons) == 4:
+ p1 = predExons[0]
+ p2 = predExons[1]
+ p3 = predExons[2]
+ p4 = predExons[3]
- if len(predExons) == 4:
- p1 = predExons[0]
- p2 = predExons[1]
- p3 = predExons[2]
- p4 = predExons[3]
+ elif len(predExons) == 2:
+ p1 = predExons[0]
+ p2 = -1
+ p3 = -1
+ p4 = predExons[1]
- #line = '%d\t%d\t%d\t%d\t%d\n' % (exampleId,p1,p2,p3,p4)
- #print line
- allPositions[exampleId] = (chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment)
+ #allPositions[exampleId] = (chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment)
# NOTE(review): p1..p4 and true_cut are computed but not used in the
# written line below -- possibly used in elided context; confirm.
+ (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
+ tExonSizes,tStarts, tEnds) = alignment
+
# str(list)[1:-1].replace(' ','') serializes each list as comma-separated
# values without brackets or spaces
+ new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+ (id,chr,strand,seq,q1,start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+ str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
+ str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
+ str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
+
+ out_fh.write(new_line)
# NOTE(review): allPositions is never populated on the '+' path, so the
# function now always returns an empty dict; callers should be checked.
return allPositions
def writePredictions(fname,allPositions):
# Format collected prediction positions into the tab-separated output file.
# NOTE(review): the '-' line removes the open() for out_fh, yet the context
# line 'out_fh.close()' at the bottom remains -- as patched this raises
# NameError (or closes a global). The TSV-writing body also moved into
# prediction_on, so this function looks superseded; confirm whether it is
# still called.
- out_fh = open(fname,'w+')
# NOTE(review): allEntries is read below but never filled in the visible
# lines -- the populating code is presumably in elided diff context.
allEntries = {}
for id,elems in allPositions.items():
- id += 1000000000000
seq,q1,q2,q3 = allEntries[id]
chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment = elems
# exon boundaries are stored relative to start_pos; shift to absolute
p3 += start_pos
p4 += start_pos
- #pdb.set_trace()
- (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
- tExonSizes,tStarts, tEnds) = alignment
-
- new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
- (id,chr,strand,seq,q1,start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
- str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
- str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
- str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
-
- out_fh.write(new_line)
out_fh.close()
jp = os.path.join
b2s = ['-','+']
- currentRun = cPickle.load(open(jp(current_dir,'run_object_1.pickle')))
- QFlag = currentRun['enable_quality_scores']
- SSFlag = currentRun['enable_splice_signals']
- ILFlag = currentRun['enable_intron_length']
- currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])
+ #currentRun = cPickle.load(open(jp(current_dir,'run_object_1.pickle')))
+ #QFlag = currentRun['enable_quality_scores']
+ #SSFlag = currentRun['enable_splice_signals']
+ #ILFlag = currentRun['enable_intron_length']
+ #currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])
- #filename = jp(current_dir,run_name)+train_suffix
- #print 'Prediction on: %s' % filename
- #prediction_on(filename)
-
filename = jp(current_dir,run_name)+test_suffix
print 'Prediction on: %s' % filename
test_result = prediction_on(filename)
fname = 'predictions.txt'
writePredictions(fname,test_result)
- #return train_result,test_result,currentRunId
-
if __name__ == '__main__':
# Entry point: the '+' lines change the CLI from collect_prediction(dir)
# to prediction_on(prediction pickle, filtered reads file, output path).
# NOTE(review): the third argument is named out_fh but is a filename
# (prediction_on opens it itself) -- misleading name, not a bug.
- dir = sys.argv[1]
- #run_name = sys.argv[2]
- collect_prediction(dir)
+ filename = sys.argv[1]
+ filtered_reads = sys.argv[2]
+ out_fh = sys.argv[3]
+ prediction_on(filename,filtered_reads,out_fh)
# -*- coding: utf-8 -*-
# Dataset-compilation driver: the '+' side of this hunk switches from
# compile_d (old reads/map files, explicit tmp_dir) to compile_d2 on the
# paper_data inputs, producing 'dataset_2nd'.
import qpalma.Configuration as Conf
-from compile_dataset import compile_d
+from compile_dataset import compile_d2
+#filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.full'
+#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/map.vm_cut'
-#filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/allFilteredReads_20_03_2008'
+filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/allReads.full.newid'
+#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/spliced.result_no_comm'
+remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/map_2nd.vm_chr'
-filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.full.perm'
-remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/map.vm_cut'
-
-compile_d(Conf.gff_fn,Conf.dna_flat_fn,filtered_fn,remapped_fn,Conf.tmp_dir,'dataset_remapped_test_new',True)
+compile_d2(Conf.gff_fn,Conf.dna_flat_fn,filtered_fn,remapped_fn,'dataset_2nd',True)