IF=QPalmaDP
all: $(OBJS) $(HDRS)
- swig -c++ -python ${IF}.i
- g++ $(CXXFLAGS) -I/usr/include/python2.5 -c ${IF}_wrap.cxx -o ${IF}_wrap.o
- g++ $(CXXFLAGS) -shared -lpython2.5 $(OBJS) ${IF}_wrap.o -o _${IF}.so
- python -c "import ${IF}"
+ @ swig -c++ -python ${IF}.i
+ @ g++ $(CXXFLAGS) -I/usr/include/python2.5 -c ${IF}_wrap.cxx -o ${IF}_wrap.o
+ @ g++ $(CXXFLAGS) -shared -lpython2.5 $(OBJS) ${IF}_wrap.o -o _${IF}.so
+ @ python -c "import ${IF}"
clean:
- rm *.o *.so *.cxx ${IF}.py ${IF}.pyc
+ @ rm *.o *.so *.cxx ${IF}.py ${IF}.pyc
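For orientation, here is a minimal, hypothetical usage sketch of the wrapped module once the build and the import check above succeed. The Alignment constructor arguments and the createIntArrayFromList helper are the ones used further below in the training script; the concrete sizes are made up:

    # Hypothetical sketch: exercising the SWIG-built QPalmaDP module.
    import QPalmaDP

    numQualPlifs = 30       # assumed: 5 read characters x 6 DNA characters
    numQualSuppPoints = 2   # matches the configuration further below
    use_quality_scores = True

    # proxy object wrapping the C++ Alignment class
    currentAlignment = QPalmaDP.Alignment(numQualPlifs, numQualSuppPoints, use_quality_scores)

    # helper from the interface file for handing plain Python lists to the C side
    c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*100)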
tempValue = prevValue + matchmatrix[mlen*dnaChar]; /* score(gap,DNA) */
}
+ //printf("tempValue: %f\n",tempValue);
if (isfinite(tempValue))
{
tempValue = prevValue +(matchmatrix[mlen* dnaChar +estChar]);
}
+ //printf("tempValue: %f\n",tempValue);
+
if (isfinite(tempValue))
{
assert(num_elem<max_num_elem) ;
tempValue = prevValue +(matchmatrix[estChar]); /* score(EST,gap) */
}
+ //printf("tempValue: %f\n",tempValue);
+
if (isfinite(tempValue))
{
assert(num_elem<max_num_elem) ;
int dist = j -(splice_pos-1); //_gt...ag: dist from g to g
assert(dist>max_len) ;
tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue;
+
+ //printf("tempValue: %f\n",tempValue);
+
if (isfinite(tempValue))
{
assert(num_elem<max_num_elem) ;
+
#include "qpalma_dp.h"
#include <cstring>
est_align = 0;
mmatrix_param = 0;
alignmentscores = 0;
- qualityScoresAllPaths = 0;
+ qualityFeaturesAllPaths = 0;
mlen = 6; // score matrix: length of 6 for "- A C G T N"
numQualSuppPoints = numq;
// dnaest
DNA_ARRAY = 0;
EST_ARRAY = 0;
+
+ //printf("[qpalma_dp] read is: %s\n",est);
//possible donor positions
int nr_donor_sites = 0 ;
// printf("%f ",pidx,p.penalties[pidx]);
// printf("\n");
//}
- //
//int h_idx;
//for(h_idx=0;h_idx<h.len;h_idx++)
memset(alignmentscores, -1, nr_paths*sizeof(double)); // sets every byte of alignmentscores to 0xFF (note: not numeric zeros)
//printf("after memset...\n");
- qualityScoresAllPaths= new penalty_struct*[nr_paths];
+ qualityFeaturesAllPaths= new penalty_struct*[nr_paths];
for (int z=0; z<nr_paths; z++) {
result_length = 0 ;
int* e_align = est_align + (est_len-1)*z ; //pointer
int* mparam = mmatrix_param + mmatrix_size*z; //pointer
- qualityScoresAllPaths[z] = new penalty_struct[numPlifs];
+ qualityFeaturesAllPaths[z] = new penalty_struct[numPlifs];
int qidx, pidx;
for(qidx=0;qidx<numPlifs;qidx++) {
penalty_struct p;
+ init_penalty_struct(p);
p.len = numQualSuppPoints;
- p.limits = (REAL*) calloc(p.len,sizeof(REAL));
+ p.limits = (double*) calloc(p.len,sizeof(double));
for(pidx=0;pidx<p.len;pidx++)
- p.limits[pidx] = qualityScores[qidx%numPlifs].limits[pidx];
+ p.limits[pidx] = qualityScores[qidx].limits[pidx];
- p.penalties = (REAL*) calloc(p.len,sizeof(REAL));
- qualityScoresAllPaths[z][qidx] = p;
+ p.penalties = (double*) calloc(p.len,sizeof(double));
+ qualityFeaturesAllPaths[z][qidx] = p;
}
//penalty_struct* qparam = qualityScoresAllPaths + (numPlifs*z);
//printf("before call to result_align...\n");
- bool no_more_path = result_align(matrices, z, est_len, dna_len, &result_length, est, dna, prb, chastity, s_align, e_align, mparam, alignmentscores, max_score_positions, qualityScoresAllPaths[z] , currentMode);
+ bool no_more_path = result_align(matrices, z, est_len, dna_len, &result_length, est, dna, prb, chastity, s_align, e_align, mparam, alignmentscores, max_score_positions, qualityFeaturesAllPaths[z] , currentMode);
//printf("after call to result_align...\n");
//printf("z is %d\n",z);
+ //int len;
//for(qidx=0;qidx<numPlifs;qidx++) {
// penalty_struct p;
- // p = qualityScoresAllPaths[qidx+numPlifs*z];
- // printf("%d ",qidx);
+ // p = qualityScoresAllPaths[z][qidx];
+ // printf("%d: ",qidx);
// for(pidx=0;pidx<p.len;pidx++)
- // printf("%f ",pidx,p.penalties[pidx]);
+ // printf("%f ",p.limits[pidx]);
+ // printf("\n");
+
+ // for(pidx=0;pidx<p.len;pidx++)
+ // printf("%f ",p.penalties[pidx]);
// printf("\n");
//}
- //
+
if(z==0) {
if(DNA_ARRAY != 0) {
for (int z=0; z<nr_paths; z++) {
for(int estChar=1;estChar<6;estChar++) {
for(int dnaChar=0;dnaChar<6;dnaChar++) {
+
int currentPos = (estChar-1)*6+dnaChar;
- currentPlif = qualityScoresAllPaths[z][currentPos];
+ currentPlif = qualityFeaturesAllPaths[z][currentPos];
+
for(int pidx=0; pidx<currentPlif.len; pidx++) {
qScores[ctr] = currentPlif.penalties[pidx];
+ //printf("%f ",qScores[ctr]);
ctr++;
}
+ //printf("\n");
}}}
- }
- //for(idx=0; idx<numPathsPlifs; idx++) {
- // currentPlif = qualityScoresAllPaths[idx];
- // //printf("Size is %dx%d\n",numPathsPlifs,currentPlif.len);
- // for(pidx=0; pidx<currentPlif.len; pidx++) {
- // qScores[ctr] = currentPlif.penalties[pidx];
- // ctr++;
- // }
- //}
-
- //printf("\nctr is %d\n",ctr);
+ //printf("\nctr is %d\n",ctr);
+ }
//printf("Leaving getAlignmentResults...\n");
}
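The loops above flatten the per-path quality plifs into qScores in a fixed order: path, then plif index (estChar-1)*6 + dnaChar, then support point. A small pure-Python sketch of that layout (sizes assumed):

    # Sketch of the qScores ordering produced by getAlignmentResults.
    nr_paths = 2
    numQualSuppPoints = 2
    qScores_index = []
    for z in range(nr_paths):
        for estChar in range(1, 6):           # read characters a,c,g,t,n
            for dnaChar in range(6):          # DNA characters -,a,c,g,t,n
                currentPos = (estChar - 1) * 6 + dnaChar
                for pidx in range(numQualSuppPoints):
                    qScores_index.append((z, currentPos, pidx))
    # each path contributes 30 plifs times numQualSuppPoints entries
    assert len(qScores_index) == nr_paths * 30 * numQualSuppPoints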
int* est_align;
int* mmatrix_param;
double* alignmentscores;
- struct penalty_struct** qualityScoresAllPaths;
+ struct penalty_struct** qualityFeaturesAllPaths;
int dna_len;
int est_len;
if(alignmentscores != 0)
delete[] alignmentscores;
- if(qualityScoresAllPaths != 0)
- delete[] qualityScoresAllPaths;
+ if(qualityFeaturesAllPaths != 0)
+ delete[] qualityFeaturesAllPaths;
}
void myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
# The parameters for the QPalma algorithm
#
#
-C = 1000
+C = 100
min_intron_len = 20
max_intron_len = 2000
min_svm_score = 0.0
max_svm_score = 1.0
-numConstraintsPerRound = 100
+numConstraintsPerRound = 1
###############################################################################
#
#numAccSuppPoints = 30
#numQualSuppPoints = 16
-numLengthSuppPoints = 20
-numDonSuppPoints = 20
-numAccSuppPoints = 20
-numQualSuppPoints = 8
+#numLengthSuppPoints = 20
+#numDonSuppPoints = 20
+#numAccSuppPoints = 20
+#numQualSuppPoints = 8
+
+numLengthSuppPoints = 2
+numDonSuppPoints = 2
+numAccSuppPoints = 2
+numQualSuppPoints = 2
min_qual = -1
max_qual = 40
anzpath = 2
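A way to read min_qual, max_qual and numQualSuppPoints together: each quality plif gets numQualSuppPoints support points spread over the quality range. The even spacing below is an assumption for illustration only; the actual limits come from the Plif module used elsewhere in the code:

    # Illustrative sketch only; assumes evenly spaced support points.
    min_qual = -1
    max_qual = 40
    numQualSuppPoints = 2

    def even_support_points(lo, hi, n):
        if n == 1:
            return [float(lo)]
        step = (hi - lo) / float(n - 1)
        return [lo + i * step for i in range(n)]

    print(even_support_points(min_qual, max_qual, numQualSuppPoints))   # [-1.0, 40.0]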
if mode == 'normal':
- fixedParam = fixedParam[:numFeatures]
+ fixedParam = fixedParamQ[:numFeatures]
elif mode == 'using_quality_scores':
fixedParam = fixedParamQ[:numFeatures]
else:
###############################################################################
training_begin = 0
-training_end = 2000
+training_end = 2
-prediction_begin = 2000
-prediction_end = 3500
+prediction_begin = 10
+prediction_end = 20
joinp = os.path.join
dna_flat_fn = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
gff_fn = joinp(annot_path,'TAIR7_GFF3_genes_Chr%s.gff_v1')
-filtered_fn = joinp(data_path,'allFilteredReads_04_02_2008')
+filtered_fn = joinp(data_path,'allFilteredReads_07_02_2008')
remapped_fn = joinp(remapped_path,'map_best_hit.18.unambig')
dataset_fn = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/chr1_dataset.pickle'
def paths_load_data(filename,expt,genome_info,PAR,test=False):
- data = io_pickle.load(filename)
+ #data = io_pickle.load(filename)
+
+ #cPickle.dump(data,open(filename+'.alt','w+'))
+
+ data = cPickle.load(open(filename+'.alt'))
Sequences = data['Sequences']
Acceptors = data['Acceptors']
return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
+
def paths_load_data_solexa(expt,genome_info,PAR,test=False):
"""
from numpy.matlib import mat,zeros,ones,inf
import pdb
from Plif import Plf,base_coord,linspace
-import Configuration
+import Configuration as Conf
-def computeSpliceAlignWithQuality(dna, exons):
+def computeSpliceAlignWithQuality(dna, exons, est=None, quality=None,
+qualityPlifs=None):
"""
Exonpos: exon start up to 1 position to the right of the exon end
It follows: the splice position start equals the exon end of the predecessor
assert totalESTLength == len(est)
- sizeMatchmatrix = Configuration.sizeMatchmatrix
+ sizeMatchmatrix = Conf.sizeMatchmatrix
# counts the occurrences of a,c,g,t,n in this order
numChar = [0]*sizeMatchmatrix[0]
# counts the occurrences of a,c,g,t,n with their quality scores
- #trueWeightQuality = zeros((Configuration.numQualPlifs*Configuration.numQualSuppPoints,1))
- trueWeightQuality = numpy.zeros(Configuration.numQualPlifs*Configuration.numQualSuppPoints)
- trueWeightQuality = trueWeightQuality.reshape(Configuration.estPlifs,Configuration.dnaPlifs,Configuration.numQualSuppPoints)
+ trueWeightQualityMat = [0.0]*5 # 5 rows
+ for row in range(5):
+ trueWeightQualityMat[row] = [0.0]*6 # 6 col per row
for elem in est:
if elem == 'a':
if elem == 'n':
numChar[4] += 1
- if Configuration.mode == 'using_quality_scores':
- for pos,elem in enumerate(numChar):
- if pos < Configuration.estPlifs:
- trueWeightQuality[pos][pos+1][-1] = elem
+ for row in range(5):
+ for col in range(6):
+ trueWeightQualityMat[row][col] = [0.0]*Conf.numQualSuppPoints # supp. points per plif
- trueWeightQuality = mat(trueWeightQuality.flatten())
- trueWeightQuality = trueWeightQuality.T
+ first_exon = dna[exons[0,0]:exons[0,1]]
+ second_exon = dna[exons[1,0]:exons[1,1]]
+ dna_part = first_exon + second_exon
+
+ assert len(dna_part) == len(est)
+
+ map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5}
+
+ if Conf.mode == 'using_quality_scores':
+
+ for idx in range(len(dna_part)):
+ dnanum = map[dna_part[idx]]
+ estnum = map[est[idx]]
+ value = quality[idx]
+
+ cur_plf = qualityPlifs[(estnum-1)*6+dnanum]
+
+ Lower = len([elem for elem in cur_plf.limits if elem <= value])
+
+ if Lower == 0:
+ trueWeightQualityMat[estnum-1][dnanum][0] += 1
+ elif Lower == len(cur_plf.limits):
+ trueWeightQualityMat[estnum-1][dnanum][-1] += 1
+ else:
+ # because we count from 0 in python
+ Lower -= 1
+ Upper = Lower+1 ; # x-values stay fixed
+ #print value,Lower,Upper
+ weightup = 1.0*(value - cur_plf.limits[Lower]) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
+ weightlow = 1.0*(cur_plf.limits[Upper] - value) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
+
+ trueWeightQualityMat[estnum-1][dnanum][Upper] += weightup
+ trueWeightQualityMat[estnum-1][dnanum][Lower] += weightlow
+
+ trueWeightQuality = zeros((Conf.numQualSuppPoints*Conf.numQualPlifs,1))
+
+ ctr = 0
+ for row in range(5):
+ for col in range(6):
+ for p_idx in range(Conf.numQualSuppPoints):
+ #print ctr, row, col, p_idx
+ trueWeightQuality[ctr] = trueWeightQualityMat[row][col][p_idx]
+ ctr += 1
+
+ assert int(sum(trueWeightQuality.flatten().tolist()[0])) == len(est)
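The interpolation above splits each observed quality value between the two neighbouring support points of the matching plif so that the two weights sum to one. A small worked example with assumed plif limits:

    # Worked example of the weighting used above (limits are assumed).
    limits = [-1.0, 40.0]    # support point x-values of one quality plif
    value = 10               # quality value observed at one read position

    Lower = len([elem for elem in limits if elem <= value])   # 1
    Lower -= 1               # 0-based index of the left support point
    Upper = Lower + 1

    weightup  = 1.0 * (value - limits[Lower]) / (limits[Upper] - limits[Lower])   # 11/41
    weightlow = 1.0 * (limits[Upper] - value) / (limits[Upper] - limits[Lower])   # 30/41
    assert abs((weightlow + weightup) - 1.0) < 1e-10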
totalNumChar = 0
for idx in range(sizeMatchmatrix[0]):
# writing in weight match matrix
# matrix is saved columnwise
- if Configuration.mode == 'normal':
+ if Conf.mode == 'normal':
trueWeightMatch = zeros((sizeMatchmatrix[0],sizeMatchmatrix[1])) # score matrix for the true alignment
for idx in range(1,sizeMatchmatrix[0]):
trueWeightMatch[idx,idx] = numChar[idx-1]
- if Configuration.mode == 'using_quality_scores':
+ if Conf.mode == 'using_quality_scores':
trueWeightMatch = zeros((sizeMatchmatrix[0],sizeMatchmatrix[1])) # score matrix for the true alignment
#for idx in range(1,sizeMatchmatrix[0]):
# trueWeightMatch[idx] = numChar[idx-1]
# Pick out the positions at which there is a donor site
try:
- #if dec:
DonorScores = [elem for pos,elem in enumerate(don_supp) if SpliceAlign[pos] == 1]
- #else:
- # DonorScores = [elem for pos,elem in enumerate(don_supp) if pos > 0 and SpliceAlign[pos-1] == 1]
+
assert not ( -inf in DonorScores )
except:
print 'Error'
if currentGene != None:
allGenes[currentGene.id] = currentGene
- return allGenes,allGenesA
+ return allGenes# ,allGenesA
def createGffPickle(annotFile,pickleFile):
# Iterate over all remapped reads in order to generate for each read a
# training / prediction example
+ instance_counter = 0
+
for id,currentFRead in all_filtered_reads.items():
try:
except:
currentRReads = None
- gene_id = currentFRead['gene_id']
- chr = currentFRead['chr']
- currentGene = allGenes[chr][gene_id]
+ try:
+ gene_id = currentFRead['gene_id']
+ chro = currentFRead['chr']
+ currentGene = allGenes[chro][gene_id]
+ except:
+ pdb.set_trace()
if currentFRead['strand'] != '+':
continue
SplitPositions.append(spos)
Ids.append(id)
+ instance_counter += 1
+
+ if instance_counter % 100 == 0:
+ print 'processed %d examples' % instance_counter
+
+ if instance_counter == 1000:
+ break
+
dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
'Exons':Exons, 'Ests':Ests, 'Qualities':Qualities,\
'SplitPositions':SplitPositions,'Ids':Ids}
spos = fRead['splitpos']
assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
- assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
+ assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == 36, 'Invalid read size'
assert p_stop - p_start >= 36
currentExons = zeros((2,2),dtype=numpy.int)
acc = [-inf] + acc[:-1]
+ currentExons[1,1] += 1
+
return currentSeq, acc, don, currentExons, currentReadSeq, quality, spos
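The corrected read-size assertion above accounts for inclusive coordinates: the upstream piece spans p_start..exon_stop and the downstream piece spans exon_start..p_stop, each contributing (stop - start + 1) positions, which is where the +2 comes from. A tiny check with made-up coordinates:

    # Made-up coordinates; both pieces are 18 nt, together one 36 nt read.
    p_start, exon_stop = 100, 117
    exon_start, p_stop = 300, 317

    up_len   = exon_stop - p_start + 1
    down_len = p_stop - exon_start + 1
    assert up_len + down_len == 36
    assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == 36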
#compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,False)
#compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,True)
- #gen_file= '%s/genome.config' % self.ARGS.basedir
- #ginfo_filename = 'genome_info.pickle'
- #self.genome_info = fetch_genome_info(ginfo_filename)
- #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
- #self.ARGS.train_with_splicesitescoreinformation = False
+ data_filename = Conf.dataset_fn
+ Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
# Load the whole dataset
if Conf.mode == 'normal':
- #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
- Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
self.use_quality_scores = False
elif Conf.mode == 'using_quality_scores':
-
- #Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
- data_filename = Conf.dataset_fn
-
- Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
-
- #end = 1000
- self.Sequences = Sequences
- self.Exons = Exons
- self.Ests = Ests
- self.Qualities = Qualities
- self.SplitPos = SplitPos
- self.Donors = Donors
- self.Acceptors = Acceptors
-
- min_score = 100
- max_score = -100
-
- for elem in self.Acceptors:
- for score in elem:
- if score != -inf:
- min_score = min(min_score,score)
- max_score = max(max_score,score)
-
- print 'Acceptors max/min: %f / %f' % (max_score,min_score)
-
- min_score = 100
- max_score = -100
-
- for elem in self.Donors:
- for score in elem:
- if score != -inf:
- min_score = min(min_score,score)
- max_score = max(max_score,score)
-
- print 'Donor max/min: %f / %f' % (max_score,min_score)
-
- min_score = 10000
- max_score = 0
- mean_intron_len = 0
-
- for elem in self.Exons:
- intron_len = int(elem[1,0] - elem[0,1])
- mean_intron_len += intron_len
- min_score = min(min_score,intron_len)
- max_score = max(max_score,intron_len)
-
- mean_intron_len /= 1.0*len(self.Exons)
- print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
-
-
self.use_quality_scores = True
else:
assert(False)
+ self.Sequences = Sequences
+ self.Exons = Exons
+ self.Ests = Ests
+ self.Qualities = Qualities
+ self.SplitPos = SplitPos
+ self.Donors = Donors
+ self.Acceptors = Acceptors
+
+ calc_info(self.Acceptors,self.Donors,self.Exons)
print 'leaving constructor...'
def plog(self,string):
# Initialize parameter vector / param = numpy.matlib.rand(126,1)
param = Conf.fixedParam
- # Set the parameters such as limits penalties for the Plifs
- [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
- #pdb.set_trace()
-
lengthSP = Conf.numLengthSuppPoints
donSP = Conf.numDonSuppPoints
accSP = Conf.numAccSuppPoints
numq = Conf.numQualSuppPoints
totalQualSP = Conf.totalQualSuppPoints
+ i1 = range(0,lengthSP)
+ i2 = range(lengthSP,lengthSP+donSP)
+ i3 = range(lengthSP+donSP,lengthSP+donSP+accSP)
+ i4 = range(lengthSP+donSP+accSP,lengthSP+donSP+accSP+mmatrixSP)
+ i5 = range(lengthSP+donSP+accSP+mmatrixSP,lengthSP+donSP+accSP+mmatrixSP+totalQualSP)
+ intervals = [] #i5,\ #i2,#i3,#i4,#i5]
+ zero_out(param,intervals)
+ pdb.set_trace()
+
+ # Set the parameters such as limits penalties for the Plifs
+ [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
# Initialize solver
if Conf.USE_OPT:
self.plog('Initializing problem...\n')
solver = SIQPSolver(Conf.numFeatures,numExamples,Conf.C,self.logfh)
- solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
- solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
+ #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
+ #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
# stores the number of alignments done for each example (best path, second-best path etc.)
num_path = [anzpath]*numExamples
# stores the gap for each example
gap = [0.0]*numExamples
-
#############################################################################################
# Training
#############################################################################################
currentPhi = zeros((Conf.numFeatures,1))
totalQualityPenalties = zeros((totalQualSP,1))
+ suboptimal_example = 0
iteration_nr = 0
param_idx = 0
const_added_ctr = 0
+
+ # the main training loop
while True:
if iteration_nr == iteration_steps:
break
dna = Sequences[exampleIdx]
est = Ests[exampleIdx]
+ est = "".join(est)
if Conf.mode == 'normal':
quality = [40]*len(est)
quality = Qualities[exampleIdx]
exons = Exons[exampleIdx]
- # NoiseMatrix = Noises[exampleIdx]
don_supp = Donors[exampleIdx]
acc_supp = Acceptors[exampleIdx]
# Compute the parameters of the true alignment (but with untrained d,a,h ...)
- trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+ if Conf.mode == 'using_quality_scores':
+ trueSpliceAlign, trueWeightMatch, trueWeightQuality =\
+ computeSpliceAlignWithQuality(dna, exons, est, quality, qualityPlifs)
+ else:
+ trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
# Calculate the weights
trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
- #idx_lst = [p for p,e in enumerate(trueWeightIntron) if e > 1e-5]
- #print idx_lst
- #print [h.limits[p] for p in idx_lst]
-
- currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
- currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
- currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
- currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
+ currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
+ currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
+ currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
+ currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
if Conf.mode == 'using_quality_scores':
totalQualityPenalties = param[-totalQualSP:]
- currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
+ currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
+
+ zero_out(currentPhi,intervals)
+ zero_out(trueWeight,intervals)
+
+ #pdb.set_trace()
# Calculate w'phi(x,y) the total score of the alignment
trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
AlignmentScores[0] = trueAlignmentScore
-
################## Calculate wrong alignment(s) ######################
# Compute donor, acceptor with penalty_lookup_new
# Create the alignment object representing the interface to the C/C++ code.
currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
-
c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
#print 'Calling myalign...'
# calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
print_matrix)
- #print 'After calling myalign...'
- #print 'Calling getAlignmentResults...'
-
c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
del c_WeightMatch
del c_DPScores
del c_qualityPlifsFeatures
- del currentAlignment
+ #del currentAlignment
newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
+
+ newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],Conf.totalQualSuppPoints)
# Calculate the weights of the respective alignments. Note that we are
# computing the n-best alignments without Hamming loss, so for each
# decoded alignment we have to keep track of whether it is equivalent
# to the true alignment.
true_map = [0]*(num_path[exampleIdx]+1)
true_map[0] = 1
- path_loss = [0]*(num_path[exampleIdx])
for pathNr in range(num_path[exampleIdx]):
#print 'decodedWeights'
acc_supp)
decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
- for qidx in range(Conf.totalQualSuppPoints):
- decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Conf.totalQualSuppPoints)+qidx]
-
- #pdb.set_trace()
-
- path_loss[pathNr] = 0
- # sum up positionwise loss between alignments
- for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
- if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
- path_loss[pathNr] += 1
-
- #pdb.set_trace()
-
+ decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
# store the weights in the remaining columns of the matrix
- wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
- allWeights[:,pathNr+1] = wp
-
- #pdb.set_trace()
+ allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
hpen = mat(h.penalties).reshape(len(h.penalties),1)
dpen = mat(d.penalties).reshape(len(d.penalties),1)
apen = mat(a.penalties).reshape(len(a.penalties),1)
- features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
+ features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
- AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
+ zero_out(features,intervals)
+ zero_out(allWeights[:,pathNr+1],intervals)
- # Check wether scalar product + loss equals viterbi score
+ #pdb.set_trace()
+
+ AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
distinct_scores = False
if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
pdb.set_trace()
+ if exampleIdx == 1:
+ self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
+ self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
+
+ pdb.set_trace()
+
# if the pathNr-best alignment is very close to the true alignment consider it as true
if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
true_map[pathNr+1] = 1
- # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
+ #pdb.set_trace()
+
+ #assert AlignmentScores[0] <= max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
+ if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
+ suboptimal_example += 1
+ self.plog("suboptimal_example %d\n" %exampleIdx)
+
# the true label sequence should not have a larger score than the maximal one (why?)
# this means that all n-best paths are too close to each other
firstFalseIdx = map_idx
break
+ #if exampleIdx == 1:
+ if False:
+
+ self.plog("Is considered as: %d\n" % true_map[1])
+
+ result_len = currentAlignment.getResultLength()
+ c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
+ c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
+
+ currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
+
+ dna_array = [0.0]*result_len
+ est_array = [0.0]*result_len
+
+ for r_idx in range(result_len):
+ dna_array[r_idx] = c_dna_array[r_idx]
+ est_array[r_idx] = c_est_array[r_idx]
+
+ _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
+ _newEstAlign = newEstAlign[0].flatten().tolist()[0]
+ line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
+ self.plog(line1+'\n')
+ self.plog(line2+'\n')
+ self.plog(line3+'\n')
+
# if there is at least one useful false alignment add the
# corresponding constraints to the optimization problem
if firstFalseIdx != -1:
- trueWeights = allWeights[:,0]
+ trueWeights = trueWeight
firstFalseWeights = allWeights[:,firstFalseIdx]
differenceVector = trueWeights - firstFalseWeights
#pdb.set_trace()
#
# end of one iteration through all examples
#
- if self.noImprovementCtr == numExamples+1:
+
+ self.plog("suboptimal rounds %d\n" %suboptimal_example)
+
+ if self.noImprovementCtr == numExamples*2:
break
iteration_nr += 1
acc_supp = Acceptors[exampleIdx]
# Compute the parameters of the true alignment (but with untrained d,a,h ...)
- trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+ if Conf.mode == 'using_quality_scores':
+ trueSpliceAlign, trueWeightMatch, trueWeightQuality =\
+ computeSpliceAlignWithQuality(dna, exons, est, quality, qualityPlifs)
+ else:
+ trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
# Calculate the weights
trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
_newSpliceAlign = newSpliceAlign.flatten().tolist()[0]
_newEstAlign = newEstAlign.flatten().tolist()[0]
- #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
- #self.plog(line1+'\n')
- #self.plog(line2+'\n')
- #self.plog(line3+'\n')
+ line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
+ self.plog(line1+'\n')
+ self.plog(line2+'\n')
+ self.plog(line3+'\n')
weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
self.plog(("Example %d"%exampleIdx) + str(exons) + " "+ str(newExons) + "\n")
- self.plog(("SpliceAlign " + str(SpliceAlign)+ "\n"))
+ #self.plog(("SpliceAlign " + str(SpliceAlign)+ "\n"))
+ self.plog(("read: " + str(est)+ "\n"))
if len(newExons) == 4:
e1_begin,e1_end = newExons[0],newExons[1]
return line1,line2,line3
- #exon_length[-1] = exon_length[-1] - tEnd + tIDX[last_identity] + 1
-
- #tStart = tIDX[first_identity]
- #tStarts[0] = tIDX[first_identity]
- #tEnd = tIDX[last_identity]
- #tEnds[-1] = tIDX[last_identity]
- #qStart = qIDX[first_identity]
- #qStarts[0] = qIDX[first_identity]
- #qEnd = qIDX[last_identity]
- #qEnds[-1] = qIDX[last_identity]
-
- #align_info.qStart = qStart
- #align_info.qEnd = qEnd
- #align_info.tStart = tStart
- #align_info.tEnd = tEnd
- #align_info.num_exons = num_exons
- #align_info.qExonSizes = qExonSizes
- #align_info.qStarts = qStarts
- #align_info.qEnds = qEnds
- #align_info.tExonSizes =tExonSizes
- #align_info.tStarts = tStarts
- #align_info.tEnds = tEnds
- #align_info.exon_length = exon_length
- #align_info.identity = identity
- #align_info.ssQuality = ssQuality
- #align_info.exonQuality = exonQuality
- #align_info.comparison = comparison
- #align_info.qIDX = qIDX
- #align_info.tIDX = tIDX
- #align_info.dna_letters = dna_letters
- #align_info.est_letters = est_letters
+def calc_info(acc,don,exons):
+ min_score = 100
+ max_score = -100
+
+ for elem in acc:
+ for score in elem:
+ if score != -inf:
+ min_score = min(min_score,score)
+ max_score = max(max_score,score)
+
+ print 'Acceptors max/min: %f / %f' % (max_score,min_score)
+
+ min_score = 100
+ max_score = -100
+
+ for elem in don:
+ for score in elem:
+ if score != -inf:
+ min_score = min(min_score,score)
+ max_score = max(max_score,score)
+
+ print 'Donor max/min: %f / %f' % (max_score,min_score)
+
+ min_score = 10000
+ max_score = 0
+ mean_intron_len = 0
+
+ for elem in exons:
+ intron_len = int(elem[1,0] - elem[0,1])
+ mean_intron_len += intron_len
+ min_score = min(min_score,intron_len)
+ max_score = max(max_score,intron_len)
+
+ mean_intron_len /= 1.0*len(exons)
+ print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
+def zero_out(vec,intervals):
+ for interval in intervals:
+ for pos in interval:
+ vec[pos] = 0.0
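zero_out simply clears the entries of a parameter or feature vector whose indices fall into the given intervals, so whole feature groups (the i1..i5 ranges built in the constructor) can be switched off during debugging. A self-contained usage sketch (the helper is repeated so the snippet runs on its own):

    def zero_out(vec, intervals):
        for interval in intervals:
            for pos in interval:
                vec[pos] = 0.0

    lengthSP, donSP = 2, 2
    i1 = range(0, lengthSP)                  # intron length plif entries
    i2 = range(lengthSP, lengthSP + donSP)   # donor plif entries

    param = [1.0] * (lengthSP + donSP)
    zero_out(param, [i2])                    # keep length features, clear donor features
    print(param)                             # [1.0, 1.0, 0.0, 0.0]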
if __name__ == '__main__':
qpalma = QPalma()
}
int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
-
void sort_genes(struct gene*** allGenes, int numGenes);
-
void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
-
void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
-
int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
-
-int fitting(char* up_prb, char* down_prb);
-
+int fitting(char* up_prb, char* down_prb, int u_off, int d_off);
void remove_ambiguities(char * old_seq, char* new_seq);
static char *info = "Usage is:\n./filterReads gff reads output";
int read_nr = 1;
int combined_reads = 0;
+
int main(int argc, char* argv[]) {
if(argc != 5) {
goto next_read;
}
- FA(currentGene != 0);
-
- if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
-
- if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
-
- if (uov != 0 && dov != 0)
- combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
-
- for(up_idx=0;up_idx<uov;up_idx++) {
- free_read(upstream_overlap[up_idx]);
- free(upstream_overlap[up_idx]);
- }
-
- for(down_idx=0;down_idx<dov;down_idx++) {
- free_read(downstream_overlap[down_idx]);
- free(downstream_overlap[down_idx]);
- }
-
- uov = dov = 0;
+ FA(strlen(seq) >= read_size);
- gene_idx++;
- exon_idx = 1;
- currentGene = (*allGenes)[gene_idx];
- while( currentGene == 0 && gene_idx < numGenes) {
- currentGene = (*allGenes)[gene_idx];
- gene_idx++;
- }
- //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
- continue;
- }
-
- if ( pos < currentGene->start ) { // go to next read
- skippedReadCtr++;
-
- next_read:
-
- lineBeginPtr = lineEndPtr;
- while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
- lineEndPtr++;
- readCtr += 1;
- current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
- current_line[lineEndPtr-lineBeginPtr] = '\0';
- continue;
- }
+ FA(currentGene != 0);
- } else { // read IS within gene borders
+ if ( currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop) { // read IS within gene borders
exon_label:
free_read(downstream_overlap[down_idx]);
free(downstream_overlap[down_idx]);
}
+
uov = dov = 0;
exon_idx++;
uselessReadCtr++;
goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
+
+ } else { // read is not within gene borders
+
+ if (uov != 0 && dov != 0)
+ combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
+
+ for(up_idx=0;up_idx<uov;up_idx++) {
+ free_read(upstream_overlap[up_idx]);
+ free(upstream_overlap[up_idx]);
+ }
+
+ for(down_idx=0;down_idx<dov;down_idx++) {
+ free_read(downstream_overlap[down_idx]);
+ free(downstream_overlap[down_idx]);
+ }
+
+ uov = dov = 0;
+
+ if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
+ gene_idx++;
+ exon_idx = 1;
+ currentGene = (*allGenes)[gene_idx];
+ while( currentGene == 0 && gene_idx < numGenes) {
+ currentGene = (*allGenes)[gene_idx];
+ gene_idx++;
+ }
+ //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
+ continue;
+ }
+
+ if ( pos < currentGene->start ) { // go to next read
+ skippedReadCtr++;
+
+ next_read:
+
+ lineBeginPtr = lineEndPtr;
+ while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
+ lineEndPtr++;
+ readCtr += 1;
+ current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
+ current_line[lineEndPtr-lineBeginPtr] = '\0';
+ continue;
+ }
}
}
- combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
+ if (uov != 0 && dov != 0)
+ combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
for(up_idx=0;up_idx<uov;up_idx++) {
free_read(upstream_overlap[up_idx]);
void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
- printf("up/down size is %d/%d\n",up_size,down_size);
+ //printf("up/down size is %d/%d\n",up_size,down_size);
- int up_idx, down_idx, overlap, success;
+ int up_idx, down_idx, success;
char* up_used_flag = calloc(up_size,sizeof(char));
char* down_used_flag = calloc(down_size,sizeof(char));
currentUpRead = upstream[up_idx];
- overlap = exon_stop - currentUpRead->pos;
-
for(down_idx=0;down_idx<down_size;down_idx++) {
- if( down_used_flag[down_idx] == 1)
+
+ if( up_used_flag[up_idx] == 1 || down_used_flag[down_idx] == 1)
continue;
currentDownRead = downstream[down_idx];
success = join_reads(exon_stop,exon_start,currentUpRead,currentDownRead,out_fs,gene_id,exon_idx);
- if(success) {
+ if(success == 1) {
up_used_flag[up_idx] = 1;
down_used_flag[down_idx] = 1;
}
u_size = -1;
d_size = -1;
- printf("possible range %d %d\n",up_range,down_range);
+ //printf("possible range %d %d\n",up_range,down_range);
if(up_range+down_range < read_size)
return 0;
int p_stop = exon_start + d_size - 1;
u_off = p_start - up_read->pos;
- d_off = exon_start - down_read->pos + 1;
+ d_off = exon_start - down_read->pos;
FA(u_size != -1);
FA(d_size != -1);
FA(u_size + d_size == read_size);
- printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
- printf("u/d size: %d %d\n",u_size,d_size);
- printf("u/d off: %d %d\n",u_off,d_off);
-
+ //printf("%lu %lu\n",up_read->id,down_read->id);
+
remove_ambiguities(up_read->seq,new_up_seq);
remove_ambiguities(down_read->seq,new_down_seq);
strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size);
new_chastity[read_size] = '\0';
- //printf("%s\n",new_seq);
- //printf("%s\n",new_prb);
- //printf("%s\n",new_cal_prb);
- //printf("%s\n",new_chastity);
+ int status = fitting(up_read->prb,down_read->prb,u_off,d_off);
+
+ if(status != 1)
+ return status;
+
+ //if( read_nr == 13 || read_nr == 14 ) {
+ // printf("read nr %d",read_nr);
+ // printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
+ // printf("u/d range: %d %d\n",up_range,down_range);
+ // printf("u/d size: %d %d\n",u_size,d_size);
+ // printf("u/d off: %d %d\n",u_off,d_off);
+ // printf("******************\n");
+
+ // printf("%s\n",up_read->seq);
+ // printf("%s\n",down_read->seq);
+
+ // printf("******************\n");
+
+ // printf("%s\n",new_up_seq);
+ // printf("%s\n",new_down_seq);
+
+ // printf("******************\n");
+ // printf("%s\n",new_seq);
+ // printf("%s\n",new_prb);
+ // printf("%s\n",new_cal_prb);
+ // printf("%s\n",new_chastity);
+ //}
//// The four last integers specify the positions of
//// p_start : the position in the dna where the first read starts
read_nr,up_read->chr,up_read->strand,new_seq,u_size,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
read_nr++;
+ combined_reads++;
free(new_up_seq);
free(new_down_seq);
return 1;
}
-int fitting(char* up_prb, char* down_prb) {
- double epsilon_mean = 30.0;
- double epsilon_var = 30.0;
- int w_size = 6;
-
+int fitting(char* up_prb, char* down_prb, int u_off, int d_off) {
double current_mean_up = 0;
- double current_variance_up = 0;
double current_mean_down = 0;
- double current_variance_down = 0;
- double* mean_up = malloc(sizeof(double)*read_size-2*w_size);
- double* variance_up = malloc(sizeof(double)*read_size-2*w_size);
- double* mean_down = malloc(sizeof(double)*read_size-2*w_size);
- double* variance_down = malloc(sizeof(double)*read_size-2*w_size);
+ int w_size = 3;
- int iidx = -1;
- int uidx;
- int didx;
+ int idx;
- for(uidx=iidx-w_size;uidx<iidx;uidx++) {
- didx = uidx+w_size;
- current_mean_up += up_prb[uidx]-50;
- current_mean_down += up_prb[didx]-50;
- }
+ for(idx=0;idx<w_size;idx++) {
+ current_mean_up += up_prb[u_off+idx]-50;
+ current_mean_up += up_prb[u_off-idx]-50;
+ current_mean_up -= up_prb[idx]-50;
- current_mean_up /= w_size;
- current_mean_down /= w_size;
-
- for(uidx=iidx-w_size;uidx<iidx;uidx++) {
- didx = uidx+w_size;
- current_variance_up += pow(up_prb[uidx]-50 - current_mean_up,2);
- current_variance_down += pow(up_prb[didx]-50 - current_mean_up,2);
+ current_mean_down += down_prb[d_off+idx]-50; /* down window comes from the downstream read's prb */
+ current_mean_down += down_prb[d_off-idx]-50;
+ current_mean_down -= down_prb[idx]-50;
}
- current_variance_up /= (w_size-1);
- current_variance_down /= (w_size-1);
-
- for(iidx=w_size;iidx<30;iidx++) {
- for(uidx=iidx-w_size;uidx<iidx;uidx++) {
- didx = uidx+w_size;
- mean_up[iidx-w_size] += down_prb[uidx]-50;
- mean_down[iidx-w_size] += down_prb[didx]-50;
-
- }
- mean_up[iidx-w_size] /= w_size;
- mean_down[iidx-w_size] /= w_size;
+ current_mean_up /= (2*w_size+1);
+ current_mean_down /= (2*w_size+1);
- for(uidx=iidx-w_size;uidx<iidx;uidx++) {
- didx = uidx+w_size;
- variance_up[iidx-w_size] += pow(down_prb[uidx]-50 - mean_up[iidx-w_size],2);
- variance_down[iidx-w_size] += pow(down_prb[didx]-50 - mean_down[iidx-w_size],2);
- }
- variance_up[iidx-w_size] /= (w_size-1);
- variance_down[iidx-w_size] /= (w_size-1);
-
- //printf("means: %f %f %f %f, variances: %f %f %f %f\n",current_mean_up,current_mean_down,mean_up,mean_down,current_variance_up,current_variance_down,variance_up,variance_down);
- //if ( abs(current_mean_up - mean_up) < epsilon_mean && abs(current_variance_up - variance_up) < epsilon_var
- //&& abs(current_mean_down - mean_down) < epsilon_mean && abs(current_variance_down - variance_down) < epsilon_var )
- // return iidx;
- }
+ double ratio;
+ if( current_mean_up > current_mean_down)
+ ratio = current_mean_down / current_mean_up;
+ else
+ ratio = current_mean_up / current_mean_down;
- int bidx;
- int bestIdx = -1;
- double min = 1000.0;
- for(bidx=0;bidx<read_size-2*w_size;bidx++) {
- if ( abs(current_mean_up - mean_up[bidx]) < epsilon_mean && abs(current_variance_up - variance_up[bidx]) < epsilon_var
- && abs(current_mean_down - mean_down[bidx]) < epsilon_mean && abs(current_variance_down - variance_down[bidx]) < epsilon_var ) {
- if ( abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx])) {
- min = abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx]);
- bestIdx = bidx;
- }
- }
- }
-
- free(mean_up);
- free(variance_up);
- free(mean_down);
- free(variance_down);
+ if (ratio < 0.5)
+ return 0;
- return bestIdx;
+ return 1;
}
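The rewritten fitting() no longer searches for a best split position; it only compares the mean quality in a small window around the two offsets and rejects the pair when the means differ by more than a factor of two. A simplified Python sketch of that check (the window handling and the -50 offset of the stored quality characters are assumptions based on the C code above):

    # Simplified sketch of the ratio-based quality check.
    def mean_around(prb, off, w_size=3):
        # mean of the (value - 50) qualities in a window around position off
        window = prb[max(0, off - w_size):off + w_size + 1]
        return sum(q - 50 for q in window) / float(len(window))

    def fits(up_prb, down_prb, u_off, d_off):
        mean_up = mean_around(up_prb, u_off)
        mean_down = mean_around(down_prb, d_off)
        lo, hi = sorted([mean_up, mean_down])
        if hi <= 0:
            return 0
        return 1 if lo / hi >= 0.5 else 0

    # made-up quality lists, values stored with a +50 offset as in the C code above
    print(fits([80]*36, [78]*36, 18, 18))    # 1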
void remove_ambiguities(char * old_seq, char* new_seq) {
# plot quality score transformations
pylab.figure()
for pos,plif in enumerate(qualityPlifs):
- if pos in [1,8,15,22]:
- pylab.plot(plif.limits,plif.penalties,'b^-',hold=True)
+ #if pos in [1,8,15,22]:
+ # pylab.plot(plif.limits,plif.penalties,'b^-',hold=True)
+
+ pylab.plot(plif.limits,plif.penalties,'b^-',hold=True)
pylab.xlabel('quality value',fontsize=20)
pylab.ylabel('transition score',fontsize=20)