fill_matrix.cpp\
qpalma_dp.cpp\
result_align.cpp\
+ debug_tools.cpp\
penalty_info.cpp\
print_align.cpp\
io.cpp
#CXXFLAGS=-O3 -fPIC
#CXXFLAGS=-O3 -fPIC -pg -fprofile-arcs
-CXXFLAGS=-O3 -fPIC -ggdb
+
+#CXXFLAGS=-O3 -fPIC -ggdb -pg
+CXXFLAGS=-O3 -fPIC -ggdb #-fno-inline
IF=QPalmaDP
#include <cstdio>
-void fassert(bool exp,int line, char* file) {
- if (! exp) {
- printf("invalid fassert at line %d in file %s!\n",line,file);
- exit(EXIT_FAILURE);
- }
-}
+void fassert(bool exp,int line, char* file);
+
+#endif // __DEBUG_TOOLS__
#define FA(expr) (fassert(expr,__LINE__,__FILE__))
-#endif // __DEBUG_TOOLS__
#define D_USE_QUALITY_SCORES 1
-void fassert(bool exp,int line, char* file);
-#define FA(expr) (fassert(expr,__LINE__,__FILE__))
-
-using namespace std;
-
int check_char(char base)
{
switch(base)
#define _QPALMA_DP_H_
#include "penalty_info.h"
+#include "debug_tools.h"
struct ArrayElem { //24B
double score;
void fill_matrix (int nr_paths, Pre_score* matrices[], int len_est, int len_dna, char* est, char* dna, double* prb, penalty_struct* functions, double* matchmatrix, penalty_struct* qualityScores, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions);
int check_char(char base);
-void fassert(bool exp,int line, char* file);
-#define FA(expr) (fassert(expr,__LINE__,__FILE__))
bool result_align (Pre_score* matrices[], int matrixnr, int i, int j, int* result_length_ptr, char* est, char* dna, double* prb, double* chastity, int* s_align, int* e_align, int* m_param, double* alignmentscores, int* max_score_positions, penalty_struct* qparam, mode currentMode);
from numpy.matlib import mat,zeros,ones,inf
import cPickle
import pdb
+#import pydb
import Configuration as Conf
import tools
from tools.PyGff import *
import sys
from genome_utils import load_genomic
+from Genefinding import *
def paths_load_data(filename,expt,genome_info,PAR,test=False):
- data = io_pickle.load(filename)
+ #data = io_pickle.load(filename)
+ data = cPickle.load(open(filename))
- #cPickle.dump(data,open(filename+'.alt','w+'))
- #data = cPickle.load(open(filename+'.alt'))
+ Sequences = data[0]
+ Acceptors = data[1]
+ Donors = data[2]
+ Exons = data[3]
+ Ests = data[4]
+ OriginalEsts= data[5]
+ Qualities = data[6]
+ UpCut = data[7]
+ StartPos = data[8]
+ AlternativeSequences = data[9]
- Sequences = data['Sequences']
- Acceptors = data['Acceptors']
- Donors = data['Donors']
- Exons = data['Exons']
- Ests = data['Ests']
- OriginalEsts= data['OriginalEsts']
- Qualities = data['Qualities']
- SplitPositions = data['SplitPositions']
-
- return Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities, SplitPositions
+ return Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities,\
+ UpCut, StartPos, AlternativeSequences
def paths_load_data_solexa(expt,genome_info,PAR,test=False):
# now we want to use interval_query to get the predicted splice scores
# trained on the TAIR sequence and annotation
- from Genefinding import *
interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
num_fields = 1
import logging
logging.basicConfig(level=logging.DEBUG,format='%(levelname)s %(message)s')
-import qpalma.Configuration as Configuration
-
class SIQP:
"""
This class is a wrapper for the cvxopt/cplex qp solvers.
"""
- def __init__(self,fSize,numExamples,c):
+ def __init__(self,fSize,numExamples,c,run):
assert fSize > 0, 'You have to have at least one feature!'
assert numExamples > 0, 'You have to have at least one example!'
self.numFeatures = fSize
self.C = c
self.EPSILON = 10e-15
+ self.run = run
+
logging.debug("Starting SIQP solver with %d examples. A feature space dim. of %d.", numExamples,fSize)
logging.debug("Regularization parameters are C=%f."%(self.C))
# end of zeroing regularizer
def createSmoothnessRegularizer(self):
- # set regularizer to zero
+ run = self.run
+
self.P = cb.matrix(0.0,(self.dimP,self.dimP))
for i in range(self.numFeatures):
self.P[i,i] = 1.0
- lengthSP = Configuration.numLengthSuppPoints
- donSP = Configuration.numDonSuppPoints
- accSP = Configuration.numAccSuppPoints
- mmatrixSP = Configuration.sizeMatchmatrix[0]\
- *Configuration.sizeMatchmatrix[1]
- numq = Configuration.numQualSuppPoints
- totalQualSP = Configuration.totalQualSuppPoints
- numQPlifs = Configuration.numQualPlifs
+ lengthSP = run['numLengthSuppPoints']
+ donSP = run['numDonSuppPoints']
+ accSP = run['numAccSuppPoints']
+ mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
+ numq = run['numQualSuppPoints']
+ totalQualSP = run['totalQualSuppPoints']
+ numQPlifs = run['numQualPlifs']
#lengthGroupParam = 5*1e-9
#spliceGroupParam = 1e-9
# Define types and constants
Inf = CPX_INFBOUND
- def __init__(self,fSize,numExamples,c,proto):
- SIQP.__init__(self,fSize,numExamples,c)
+ def __init__(self,fSize,numExamples,c,proto,run):
+ SIQP.__init__(self,fSize,numExamples,c,run)
self.numFeatures = fSize
self.numExamples = numExamples
self.numVariables = self.numFeatures + self.numExamples
self.ub = FloatMatrix([self.Inf] * self.numVariables).data
self.lb = FloatMatrix([-self.Inf] * self.numFeatures + [0.0]*self.numExamples).data
+ # set intron length to zero
+ #import pdb
+ #pdb.set_trace()
+
+ #self.ub[0] = 0.0
+ #self.ub[1] = 0.0
+
+ #self.lb[0] = 0.0
+ #self.lb[1] = 0.0
+
+ #pdb.set_trace()
+
+ ############################
+
self.matbeg = IntMatrix([1]*self.numVariables).data
self.matbeg[0] = 0
from numpy.matlib import mat,zeros,ones,inf
import pdb
from Plif import Plf,base_coord,linspace
-import Configuration as Conf
-def computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs):
+import Helpers
+
+def computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs, run):
"""
Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
# start of label feature generation
#
- sizeMatchmatrix = Conf.sizeMatchmatrix
+ sizeMatchmatrix = run['matchmatrixRows']*run['matchmatrixCols'] #
# counts the occurences of a,c,g,t,n in this order
# counts the occurrences of a,c,g,t,n with their quality scores
for row in range(5):
for col in range(6):
- trueWeightQualityMat[row][col] = [0.0]*Conf.numQualSuppPoints # supp. points per plif
+ trueWeightQualityMat[row][col] = [0.0]*run['numQualSuppPoints'] # supp. points per plif
dna_part = []
est_part = []
orig_pos += 1
for orig_char in dna_part:
- assert orig_char in Conf.alphabet, pdb.set_trace()
+ assert orig_char in Helpers.alphabet, pdb.set_trace()
for orig_char in est_part:
- assert orig_char in Conf.alphabet, pdb.set_trace()
+ assert orig_char in Helpers.alphabet, pdb.set_trace()
- trueWeightMatch = zeros((sizeMatchmatrix[0]*sizeMatchmatrix[1],1)) # Scorematrix fuer Wahrheit
+ trueWeightMatch = zeros((run['matchmatrixRows']*run['matchmatrixCols'],1)) # Scorematrix fuer Wahrheit
map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5}
trueWeightQualityMat[estnum-1][dnanum][Upper] += weightup
trueWeightQualityMat[estnum-1][dnanum][Lower] += weightlow
- trueWeightQuality = zeros((Conf.numQualSuppPoints*Conf.numQualPlifs,1))
+ trueWeightQuality = zeros((run['numQualSuppPoints']*run['numQualPlifs'],1))
ctr = 0
for row in range(5):
for col in range(6):
- for p_idx in range(Conf.numQualSuppPoints):
+ for p_idx in range(run['numQualSuppPoints']):
#print ctr, row, col, p_idx
trueWeightQuality[ctr] = trueWeightQualityMat[row][col][p_idx]
ctr += 1
splitpos = int(splitpos)
read_size = int(read_size)
+ seq=seq.lower()
+
assert strand in ['D','P']
if strand == 'D':
We assume that a line has the following entires:
"""
- id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
- pos = int(pos)
- mismatches = int(mismatches)
- align_len = int(align_len)
- offset = int(offset)
- line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand, 'mismatches':mismatches, 'align_len':align_len,\
- 'offset':offset, 'alignment':alignment}
+ id,chr1,pos1,seq1,chr2,pos2,seq2,quality = line.split()
+
+ chr1 = int(chr1)
+ chr2 = int(chr2)
+
+ seq1=seq1.lower()
+ seq2=seq2.lower()
+
+ pos1 = int(pos1)
+ pos2 = int(pos2)
+
+ line_d = {'id':id, 'chr1':chr1, 'pos1':pos1, 'seq1':seq1, 'chr2':chr2,\
+ 'pos2':pos2, 'seq2':seq2, 'quality':'quality'}
+
return line_d
def next(self):
old_entry = entries[id]
old_entry.append(line_d)
entries[id] = old_entry
+
+ return entries
+
+
+class MapParser(ReadParser):
+ """
+ This class offers a parser for the reads that are remapped by the vmatch
+ utility.
+
+ According to the docu the entries are:
+
+ ID, Chromosome, Position, Orientation (D or P), Mismatches, Alignment length, Offset, Alignment
+
+ """
+
+ def __init__(self,filename):
+ ReadParser.__init__(self,filename)
+
+ def parseLine(self,line):
+ """
+ We assume that a line has the following entries:
+
+ """
+
+ id,chr,pos,strand,mismatches,length,offset,seq = line.split()
+
+ chr = int(chr)
+ pos = int(pos)
+
+ if strand == 'D':
+ strand = '+'
+
+ if strand == 'P':
+ strand = '-'
+
+ mismatches = int(mismatches)
+ length = int(length)
+ offset = int(offset)
+
+ seq=seq.lower()
+
+ line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand,\
+ 'mismatches':mismatches, 'length':length, 'offset':offset,\
+ 'seq':seq}
+
+ return line_d
+
+ def next(self):
+ for line in self.fh:
+ line = line.strip()
+ yield self.parseLine(line)
+
+ raise StopIteration
+
+ def parse(self):
+ entries = {}
+
+ for elem in self.fh:
+ line_d = self.parseLine(elem)
+ id = line_d['id']
+ try:
+ entries[id] = [line_d]
+ except:
+ old_entry = entries[id]
+ old_entry.append(line_d)
+ entries[id] = old_entry
+
+ return entries
import numpy.matlib
import QPalmaDP
import pdb
-import Configuration as Conf
from Plif import *
-def set_param_palma(param, train_with_intronlengthinformation,\
- min_intron_len=None, max_intron_len=None, min_svm_score=None, max_svm_score=None):
+import qpalma.Configuration as Conf
+
+def set_param_palma(param, train_with_intronlengthinformation, run):
print 'Setting parameters ...'
- min_intron_len = Conf.min_intron_len
- max_intron_len = Conf.max_intron_len
+ min_intron_len = run['min_intron_len']
+ max_intron_len = run['max_intron_len']
- min_svm_score = Conf.min_svm_score
- max_svm_score = Conf.max_svm_score
+ min_svm_score = run['min_svm_score']
+ max_svm_score = run['max_svm_score']
h = Plf()
d = Plf()
a = Plf()
- qualityPlifs = [None]*Conf.numQualPlifs
- donSP = Conf.numDonSuppPoints
- accSP = Conf.numAccSuppPoints
- lengthSP = Conf.numLengthSuppPoints
- mmatrixSP = Conf.sizeMatchmatrix[0]\
- *Conf.sizeMatchmatrix[1]
- totalQualSP = Conf.totalQualSuppPoints
+ qualityPlifs = [None]*run['numQualPlifs']
+ donSP = run['numDonSuppPoints']
+ accSP = run['numAccSuppPoints']
+ lengthSP = run['numLengthSuppPoints']
+ mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
+ totalQualSP = run['totalQualSuppPoints']
####################
# Gapfunktion
# Matchmatrix
####################
mmatrix = numpy.matlib.mat(param[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP])
- mmatrix.reshape(Conf.sizeMatchmatrix[0],Conf.sizeMatchmatrix[1])
+
+
+
+ mmatrix.reshape(run['matchmatrixRows'],run['matchmatrixCols'])
+
+
####################
# Quality Plifs
####################
- for idx in range(Conf.numQualPlifs):
+ for idx in range(run['numQualPlifs']):
currentPlif = Plf()
- currentPlif.limits = linspace(Conf.min_qual,Conf.max_qual,Conf.numQualSuppPoints)
- begin = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
- end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
+ currentPlif.limits = linspace(run['min_qual'],run['max_qual'],run['numQualSuppPoints'])
+ begin = lengthSP+donSP+accSP+mmatrixSP+(idx*run['numQualSuppPoints'])
+ end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*run['numQualSuppPoints'])
+
currentPlif.penalties = param[begin:end].flatten().tolist()[0]
currentPlif.transform = ''
currentPlif.name = 'q'
- currentPlif.max_len = Conf.max_qual
- currentPlif.min_len = Conf.min_qual
+
+ currentPlif.max_len = run['max_qual']
+ currentPlif.min_len = run['min_qual']
+
qualityPlifs[idx] = currentPlif
return h,d,a,mmatrix,qualityPlifs
import pdb
import os
import os.path
+import math
+
def evaluatePositions(eBegin,eEnd):
eBegin_pos = [elem for elem in eBegin if elem == 0]
def perform_prediction(current_dir,run_name):
- #cmd = 'echo ./doPrediction.sh %s | qsub -l h_vmem=1.0G -cwd -j y -N \"%s.log\"'%(current_dir,run_name)
- cmd = './doPrediction.sh %s 1>%s.out 2>%s.err' %(current_dir,run_name,run_name)
- os.system(cmd)
+ cmd = 'echo /fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s | qsub -l h_vmem=12.0G -cwd -j y -N \"%s.log\"'%(current_dir,run_name)
+ #cmd = './doPrediction.sh %s 1>%s.out 2>%s.err' %(current_dir,run_name,run_name)
#print cmd
-
+ os.system(cmd)
def forall_experiments(current_func,tl_dir):
"""
allWrongExons = []
allDoubleScores = []
- ctr = 0
+ all_pos_correct_ctr = 0
for current_example_pred in allPredictions:
- ambigous_match = False
- if len(current_example_pred) > 1:
- ambigous_match = True
- example_scores = []
+ for elem_nr,current_pred in enumerate(current_example_pred[:1]):
+
+ #gt_score = gt_example[]
+ #score = gt_example['DPScores'].flatten().tolist()[0][0]
+
+ #all_pos_correct = evaluate_example(current_pred)
+ all_pos_correct = evaluate_unmapped_example(current_pred)
+
+ if all_pos_correct:
+ all_pos_correct_ctr += 1
+
+ print 'Total num. of examples: %d' % len(allPredictions)
+ print 'Number of total correct examples: %d' % all_pos_correct_ctr
+
+ #print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
+ #print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
+ #print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
+
+def evaluate_unmapped_example(current_prediction):
+
+ predExons = current_prediction['predExons']
+ trueExons = current_prediction['trueExons']
+
+ return compare_exons(predExons,trueExons)
+
+
+def evaluate_example(current_prediction):
+
+ label = False
+
+ try:
+ label = current_prediction['label']
+ except:
+ pass
+
+ #predExons = current_prediction['predExons']
+ #trueExons = current_prediction['trueExons']
+ #pdb.set_trace()
+
+ if label:
+
+ start_pos = current_prediction['start_pos']
+ alternative_start_pos = current_prediction['alternative_start_pos']
+
+ predExons = current_prediction['predExons']
+ trueExons = current_prediction['trueExons']
+
+ #pdb.set_trace()
+
+ if start_pos <= alternative_start_pos:
+ start_offset = alternative_start_pos-start_pos
+ predExons += start_offset
+
+ return compare_exons(predExons,trueExons)
+
+ if alternative_start_pos <= start_pos:
+ start_offset = start_pos-alternative_start_pos
+ predExons += start_offset
+
+ return compare_exons(predExons,trueExons)
+
+ else:
+ pred_score = current_prediction['DPScores'].flatten().tolist()[0][0]
+
+ return False
+
+
+def compare_exons(predExons,trueExons):
+ e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
- for elem_nr,current_pred in enumerate(current_example_pred):
- e1_b_off = current_pred['e1_b_off']
- e1_e_off = current_pred['e1_e_off']
- e2_b_off = current_pred['e2_b_off']
- e2_e_off = current_pred['e2_e_off']
+ if len(predExons) == 4:
+ e1_begin,e1_end = predExons[0],predExons[1]
+ e2_begin,e2_end = predExons[2],predExons[3]
+ else:
+ return False
+ e1_b_off = int(math.fabs(e1_begin - trueExons[0,0]))
+ e1_e_off = int(math.fabs(e1_end - trueExons[0,1]))
+
+ e2_b_off = int(math.fabs(e2_begin - trueExons[1,0]))
+ e2_e_off = int(math.fabs(e2_end - trueExons[1,1]))
+
+ if e1_b_off == 0 and e1_e_off == 0 and e2_b_off == 0\
+ and e2_e_off == 0:
+ return True
+
+ return False
+
+
+def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign):
+ newExons = []
+ oldElem = -1
+ SpliceAlign = SpliceAlign.flatten().tolist()[0]
+ SpliceAlign.append(-1)
+ for pos,elem in enumerate(SpliceAlign):
+ if pos == 0:
+ oldElem = -1
+ else:
+ oldElem = SpliceAlign[pos-1]
+
+ if oldElem != 0 and elem == 0: # start of exon
+ newExons.append(pos)
+
+ if oldElem == 0 and elem != 0: # end of exon
+ newExons.append(pos)
+
+ e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
+
+ if len(newExons) == 4:
+ e1_begin,e1_end = newExons[0],newExons[1]
+ e2_begin,e2_end = newExons[2],newExons[3]
+ else:
+ return None,None,None,None,newExons
+
+ e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
+ e1_e_off = int(math.fabs(e1_end - exons[0,1]))
+
+ e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
+ e2_e_off = int(math.fabs(e2_end - exons[1,1]))
+
+ return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
+
+
+if __name__ == '__main__':
+ dir = sys.argv[1]
+ assert os.path.exists(dir), 'Error directory does not exist!'
+
+ #forall_experiments(perform_prediction,dir)
+ forall_experiments(collect_prediction,dir)
+
+"""
if elem_nr > 0:
#print 'start positions'
#print current_pred['start_pos'], current_pred['alternative_start_pos']
if current_pred['label'] == False or (current_pred['label'] == True
and len(current_pred['predExons']) != 4):
- if current_pred['DPScores'].flatten().tolist()[0][0] <\
+ if
current_example_pred[0]['DPScores'].flatten().tolist()[0][0]:
print current_pred['trueExons'][0,1]-current_pred['trueExons'][0,0],\
- current_pred['trueExons'][1,1]-current_pred['trueExons'][1,0],\
+ current_pred['trueExons'][1,1] - current_pred['trueExons'][1,0],\
current_pred['predExons']
print current_pred['DPScores'].flatten().tolist()[0][0],\
current_example_pred[0]['DPScores'].flatten().tolist()[0][0]
e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = evaluatePositions(exon2Begin,exon2End)
allDoubleScores.append(example_scores)
-
- all_pos_correct = 0
- for idx in range(len(exon1Begin)):
- if exon1Begin[idx] == 0 and exon1End[idx] == 0\
- and exon2Begin[idx] == 0 and exon2End[idx] == 0:
- all_pos_correct += 1
-
- print 'Total num. of examples: %d' % len(allPredictions)
- print 'Number of total correct examples: %d' % all_pos_correct
- print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
- print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
- print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
-
-
-if __name__ == '__main__':
- dir = sys.argv[1]
- assert os.path.exists(dir), 'Error directory does not exist!'
-
- #forall_experiments(perform_prediction,dir)
- forall_experiments(collect_prediction,dir)
+"""
numFolds=5
# the main directory where all results are stored
- experiment_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test'
+ experiment_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_2k'
assert os.path.exists(experiment_dir), 'toplevel dir for experiment does not exist!'
allRuns = []
- dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_test'
+ #dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_test'
+ dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_test_2k'
for QFlag in [True,False]:
for SSFlag in [True,False]:
- for ILFlag in [True,False]:
+ for ILFlag in [True]:
+ #for ILFlag in [True,False]:
# create a new Run object
currentRun = Run()
currentRun['print_matrix'] = Conf.print_matrix
currentRun['read_size'] = Conf.read_size
- currentRun['numLengthSuppPoints'] = 2 #Conf.numLengthSuppPoints
+ currentRun['numLengthSuppPoints'] = 10 #Conf.numLengthSuppPoints
currentRun['numDonSuppPoints'] = 10
currentRun['numAccSuppPoints'] = 10
def doSelection(self):
for instance in self.allInstances:
- time.sleep(3)
+ time.sleep(2)
os.system('echo ./resurrect %s | qsub -l h_vmem=3.0G -cwd -j y -N \"%s.log\"'%(instance,instance))
#os.system('echo "./resurrect %s >out_%s.log 2>err_%s.log &"'%(instance,instance,instance))
import os
import re
import pdb
-import pydb
+#import pydb
import io_pickle
import numpy
if instance_counter % 1000 == 0:
print 'processed %d examples' % instance_counter
- if instance_counter == 2000:
+ if instance_counter == 20000:
break
print 'Full dataset has size %d' % len(Sequences)
#'Ids':Ids, 'FReads':FReads}
# saving new dataset
- io_pickle.save(dataset_file,dataset)
+ #io_pickle.save(dataset_file,dataset)
+
+ #io_pickle.save(dataset_file,dataset)
+ cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
def process_filtered_read(fRead,currentGene,dna_flat_files,test):
# now fetch acceptor and donor scores:
- sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
- % (chr,strand)
+ #sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
+ #% (chr,strand)
+
+ sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s' % (chr,strand)
# fetch acceptor scores
result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)
acc = acc[1:] + [-inf]
# fetch donor scores
- sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
- % (chr,strand)
+ #sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
+ #% (chr,strand)
+
+ sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s' % (chr,strand)
result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)
# middle part of the read matches somewhere
# search up/downstream for the rest
else:
- genomicSeq_start = pos
- genomicSeq_stop = pos + fragment_size
+ genomicSeq_start = pos - (fragment_size/2)
+ genomicSeq_stop = pos + (fragment_size/2)
alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
- genomicSeq_start = pos - fragment_size
- genomicSeq_stop = pos+length
- alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
+ #genomicSeq_start = pos - fragment_size
+ #genomicSeq_stop = pos+36
+ #alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
continue
return alternativeSeq
import pdb
import re
import os.path
-import pydb
+#import pydb
from compile_dataset import getSpliceScores
else:
assert(False)
- full_working_path = os.path.join(run['experiment_path'],run['name'])
- #assert not os.path.exists(full_working_path)
- #os.mkdir(full_working_path)
- assert os.path.exists(full_working_path)
-
- # ATTENTION: Changing working directory
- os.chdir(full_working_path)
-
- cPickle.dump(run,open('run_object.pickle','w+'))
def plog(self,string):
self.logfh.write(string)
def train(self):
run = self.run
+ full_working_path = os.path.join(run['experiment_path'],run['name'])
+
+ assert not os.path.exists(full_working_path)
+ os.mkdir(full_working_path)
+
+ assert os.path.exists(full_working_path)
+
+ # ATTENTION: Changing working directory
+ os.chdir(full_working_path)
+
+ cPickle.dump(run,open('run_object.pickle','w+'))
+
self.logfh = open('_qpalma_train.log','w+')
self.plog("Settings are:\n")
_newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
_newEstAlign = newEstAlign[0].flatten().tolist()[0]
+
line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
self.plog(line1+'\n')
self.plog(line2+'\n')
self.logfh = open('_qpalma_predict.log','w+')
# predict on training set
- print '##### Prediction on the training set #####'
+ self.plog('##### Prediction on the training set #####\n')
+
self.predict(param_filename,0,beg,'TRAIN')
# predict on test set
- print '##### Prediction on the test set #####'
+ self.plog('##### Prediction on the test set #####\n')
self.predict(param_filename,beg,end,'TEST')
+ self.plog('##### Finished prediction #####\n')
self.logfh.close()
- #pdb.set_trace()
def predict(self,param_filename,beg,end,set_flag):
"""
AlternativeSequences = self.AlternativeSequences[beg:end]
- print 'STARTING PREDICTION'
-
# number of training instances
N = numExamples = len(Sequences)
assert len(Exons) == N and len(Ests) == N\
# beginning of the prediction loop
for exampleIdx in range(numExamples):
+ self.plog('Loading example nr. %d...\n'%exampleIdx)
+
dna = Sequences[exampleIdx]
est = Ests[exampleIdx]
# first make a prediction on the dna fragment which comes from the ground truth
current_prediction = self.calc_alignment(dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs)
current_prediction['exampleIdx'] = exampleIdx
- current_prediction['start_pos'] = current_start_pos
+ current_prediction['start_pos'] = current_start_pos
+ current_prediction['label'] = True
current_example_predictions.append(current_prediction)
chr, strand, genomicSeq_start, genomicSeq_stop, currentLabel = alternative_alignment
currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+ if not run['enable_splice_signals']:
+ for idx,elem in enumerate(currentDon):
+ if elem != -inf:
+ currentDon[idx] = 0.0
+
+ for idx,elem in enumerate(currentAcc):
+ if elem != -inf:
+ currentAcc[idx] = 0.0
+
current_prediction = self.calc_alignment(currentDNASeq, est, exons,\
quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
current_prediction['exampleIdx'] = exampleIdx
_newSpliceAlign = newSpliceAlign.flatten().tolist()[0]
_newEstAlign = newEstAlign.flatten().tolist()[0]
+
+ if False:
+ line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
+ self.plog(line1+'\n')
+ self.plog(line2+'\n')
+ self.plog(line3+'\n')
- line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
- self.plog(line1+'\n')
- self.plog(line2+'\n')
- self.plog(line3+'\n')
-
- e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = self.evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign)
+ newExons = self.calculatePredictedExons(newSpliceAlign)
- current_prediction = {'e1_b_off':e1_b_off,'e1_e_off':e1_e_off,\
- 'e2_b_off':e2_b_off ,'e2_e_off':e2_e_off,\
- 'predExons':newExons, 'trueExons':exons,\
- 'dna':dna, 'est':est, 'DPScores':newDPScores }
+ current_prediction = {'predExons':newExons, 'trueExons':exons,\
+ 'dna':dna, 'est':est, 'DPScores':newDPScores}
return current_prediction
- def evaluateExample(self,dna,est,exons,SpliceAlign,newEstAlign):
+ def calculatePredictedExons(self,SpliceAlign):
newExons = []
oldElem = -1
SpliceAlign = SpliceAlign.flatten().tolist()[0]
if oldElem == 0 and elem != 0: # end of exon
newExons.append(pos)
- e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
-
- #self.plog(("Example %d"%exampleIdx) + str(exons) + " "+ str(newExons) + "\n")
- #self.plog(("SpliceAlign " + str(SpliceAlign)+ "\n"))
- self.plog(("read: " + str(est)+ "\n"))
-
- if len(newExons) == 4:
- e1_begin,e1_end = newExons[0],newExons[1]
- e2_begin,e2_end = newExons[2],newExons[3]
- else:
- return None,None,None,None,newExons
-
- e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
- e1_e_off = int(math.fabs(e1_end - exons[0,1]))
-
- e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
- e2_e_off = int(math.fabs(e2_end - exons[1,1]))
-
- return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
+ return newExons
def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
all: $(OBJS)
@ $(CC) $(CFLAGS) -o filterReads $(OBJS) filterReads.c
+test:
+ @ ./filterReads subset.gff subset.reads output 1
+
clean:
@ rm -rf *.o
int combined_reads = 0;
+
int main(int argc, char* argv[]) {
if(argc != 5) {
FILE *reads_fs = fopen(reads_filename,"r");
FILE *out_fs = fopen(output_filename,"w");
- //read_nr = atoi(argv[4]);
read_nr = strtoul(argv[4],NULL,10);
read_nr++;
if(up_range+down_range < read_size)
return 0;
- int half_read = read_size / 2;
-
// both overlap more than a half of the new read
- if(up_range >= half_read && down_range >= half_read) {
+ //if(up_range >= half_read && down_range >= half_read) {
- u_size = half_read;
- d_size = half_read;
+ // u_size = half_read;
+ // d_size = half_read;
- } else {
+ //} else {
//if(up_range < down_range) {
// u_size = up_range;
// u_size = read_size - d_size;
//}
- if(up_range < down_range) {
+ //if(up_range < down_range) {
+ if (read_nr % 2 != 0) {
d_size = down_range;
u_size = read_size - d_size;
}
- if(up_range > down_range) {
+ //if(up_range >= down_range) {
+ if (read_nr % 2 == 0) {
u_size = up_range;
d_size = read_size - u_size;
}
- }
+ //}
int p_start = exon_stop - u_size + 1;
int p_stop = exon_start + d_size - 1;
//fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n",
// read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,up_read->seq,down_read->seq);
- fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
- read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
+ fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
+ read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
read_nr++;
combined_reads++;
"""
assert(Configuration.mode == 'using_quality_scores')
param = cPickle.load(open(filename))
- [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,train_with_intronlengthinformation)
+ [h,d,a,mmatrix,qualityPlifs] =\
+ set_param_palma(param,train_with_intronlengthinformation,None)
# construct the substitution matrix using the last value of qualityPlif
substmat = numpy.matlib.zeros((1,(Configuration.estPlifs+1)*Configuration.dnaPlifs))
pylab.yticks( numpy.arange(5.5,0,-1), ('-', 'A', 'C', 'G', 'T', 'N') )
pylab.xlabel('DNA',fontsize=20)
pylab.ylabel('EST',fontsize=20)
+
+ pylab.savefig('matrix.eps')
+ pylab.clf()
# plot donor and acceptor transformations
pylab.figure()
pylab.ylabel('transition score',fontsize=20)
pylab.legend()
+ pylab.savefig('donacc.eps')
+ pylab.clf()
+
# plot intron length transformations
pylab.figure()
pylab.plot(h.limits,h.penalties,'k+-')
pylab.xlabel('intron length',fontsize=20)
pylab.ylabel('transition score',fontsize=20)
+ pylab.savefig('intron_len.eps')
+ pylab.clf()
+
# plot quality score transformations
pylab.figure()
for pos,plif in enumerate(qualityPlifs):
- #if pos in [1,8,15,22]:
- # pylab.plot(plif.limits,plif.penalties,'b^-',hold=True)
+ if pos in [1,8,15,22,7,13,19]:
+ #if pos in [3,9,21]:
+ pylab.plot(plif.limits,plif.penalties,'b^-',hold=True)
- pylab.plot(plif.limits,plif.penalties,'b^-',hold=True)
+ #pylab.plot(plif.limits,plif.penalties,'b^-',hold=True)
pylab.xlabel('quality value',fontsize=20)
pylab.ylabel('transition score',fontsize=20)
- pylab.show()
+ pylab.savefig('quality.eps')
+ pylab.clf()
+
+ #pylab.show()
if __name__ == '__main__':