import qpalma.Configuration as Conf
+warning = """
+ #########################################################
+ WARNING ! Disabled the use of non-bracket reads WARNING !
+ #########################################################
+
+ """
+
+
help = """
Usage of this script is:
if instance_counter % 100 == 0:
print 'processed %d examples' % instance_counter
- if instance_counter == 2000:
+ if instance_counter == 5000:
break
print 'Full dataset has size %d' % len(Sequences)
chr = fRead['chr']
strand = fRead['strand']
- quality = fRead['prb']
+ #quality = fRead['prb']
+ quality = fRead['cal_prb']
+ #quality = fRead['chastity']
spos = fRead['splitpos']
currentReadSeq = fRead['seq']
currentReadSeq = currentReadSeq.lower()
- if currentReadSeq.find('[') == -1:
- return nil
+ #if currentReadSeq.find('[') == -1:
+ # print warning
+ # return nil
originalReadSeq = fRead['seq']
seq_has_indels = False
for elem in currentReadSeq:
- assert elem in extended_alphabet, 'Invalid char (extended_alphabet) in read: \"%s\"' % elem
+ #assert elem in extended_alphabet, 'Invalid char (extended_alphabet) in read: \"%s\"' % elem
+ if not elem in extended_alphabet:
+ return nil
+
if elem == ']':
seq_has_indels = True
for elem in currentDNASeq:
- assert elem in alphabet, 'Invalid char in dna: \"%s\"' % elem
+ #assert elem in alphabet, 'Invalid char in dna: \"%s\"' % elem
+ if not elem in alphabet:
+ return nil
if strand == '-':
try:
if not dna_fragment_1.replace('-','') == dna_annot_1:
print 'genomic seq mismatch'
+ print 'orignal read:\t %s ' % originalReadSeq
+ print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
+ print 'dna_fragment_1:\t %s' % dna_fragment_1
+ print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
return nil
if not dna_fragment_2.replace('-','') == dna_annot_2:
print 'genomic seq mismatch'
- print originalReadSeq
- print firstReadSeq
- print secondReadSeq
- print dna_annot_1 + dna_annot_2
+ print 'orignal read:\t %s ' % originalReadSeq
+ print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
+ print 'dna_fragment_2:\t %s' % dna_fragment_2
+ print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
return nil
#print 'successful'
for elem in currentReadSeq:
assert elem in alphabet, 'Invalid char (alphabet) in read: \"%s\"' % elem
- assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
+ #assert p_start < exon_stop < exon_start < p_stop, pdb.set_trace()
assert p_stop - p_start >= Conf.read_size
currentExons = zeros((2,2),dtype=numpy.int)
return nil
except:
- pdb.set_trace()
+ return nil
+ #pdb.set_trace()
# now we want to use interval_query to get the predicted splice scores
# trained on the TAIR sequence and annotation
continue
don[position-1] = scores[i]
except:
- pdb.set_trace()
+ return nil
+ #pdb.set_trace()
don = [-inf] + don[:-1]
- assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
- assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
+ #assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
+ #assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
+ #assert (len(currentDNASeq)) == len(acc), pdb.set_trace()
+ #assert (len(currentDNASeq)) == len(don), pdb.set_trace()
- assert (len(currentDNASeq)) == len(acc), pdb.set_trace()
- assert (len(currentDNASeq)) == len(don), pdb.set_trace()
- acc = [-inf] + acc[:-1]
+ if not ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf]:
+ return nil
+ if not gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf]:
+ return nil
+ if not (len(currentDNASeq)) == len(acc):
+ return nil
+
+ if not (len(currentDNASeq)) == len(don):
+ return nil
+
+ acc = [-inf] + acc[:-1]
return currentDNASeq, acc, don, currentExons, currentReadSeq, originalReadSeq, quality, spos
#compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,False)
#compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,True)
- pdb.set_trace()
-
data_filename = Conf.dataset_fn
Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
self.Donors = Donors
self.Acceptors = Acceptors
- calc_info(self.Acceptors,self.Donors,self.Exons)
+ calc_info(self.Acceptors,self.Donors,self.Exons,self.Qualities)
+ pdb.set_trace()
print 'leaving constructor...'
def plog(self,string):
self.logfh.write(string)
self.logfh.flush()
- def train(self,value):
+ def train(self):
self.logfh = open('_qpalma_train.log','w+')
beg = Conf.training_begin
numq = Conf.numQualSuppPoints
totalQualSP = Conf.totalQualSuppPoints
- i1 = range(0,lengthSP)
- i2 = range(lengthSP,lengthSP+donSP)
- i3 = range(lengthSP+donSP,lengthSP+donSP+accSP)
- i4 = range(lengthSP+donSP+accSP,lengthSP+donSP+accSP+mmatrixSP)
- i5 = range(lengthSP+donSP+accSP+mmatrixSP,lengthSP+donSP+accSP+mmatrixSP+totalQualSP)
+ #i1 = range(0,lengthSP)
+ #i2 = range(lengthSP,lengthSP+donSP)
+ #i3 = range(lengthSP+donSP,lengthSP+donSP+accSP)
+ #i4 = range(lengthSP+donSP+accSP,lengthSP+donSP+accSP+mmatrixSP)
+ #i5 = range(lengthSP+donSP+accSP+mmatrixSP,lengthSP+donSP+accSP+mmatrixSP+totalQualSP)
- offset =lengthSP+donSP+accSP+mmatrixSP
+ #offset =lengthSP+donSP+accSP+mmatrixSP
#intervals = [[offset+2,offset+3]] #i5,\ #i2,#i3,#i4,#i5]
#intervals = []
if Conf.mode == 'using_quality_scores':
quality = Qualities[exampleIdx]
- #quality = [(int(math.fabs(e))) for e in quality]
+ # if Conf.disable_quality_scores
#quality = [40]*len(est)
exons = Exons[exampleIdx]
don_supp = Donors[exampleIdx]
acc_supp = Acceptors[exampleIdx]
- #for idx,elem in enumerate(don_supp):
- # if elem != -inf:
- # don_supp[idx] = 0.0
+ # if Conf.disable_splice_signals
+ #don_supp = [0.0]*len(don_supp)
+ #acc_supp = [0.0]*len(acc_supp)
- #for idx,elem in enumerate(acc_supp):
- # if elem != -inf:
- # acc_supp[idx] = 0.0
+ for idx,elem in enumerate(don_supp):
+ if elem != -inf:
+ don_supp[idx] = 0.0
+
+ for idx,elem in enumerate(acc_supp):
+ if elem != -inf:
+ acc_supp[idx] = 0.0
# Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
if Conf.mode == 'using_quality_scores':
# check that splice site scores are at dna positions as expected by
# the dynamic programming component
- for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
- assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
- or dna[d_pos+1] == 't'), pdb.set_trace()
-
- for a_pos in [pos for pos,elem in enumerate(acceptor) if elem != -inf]:
- assert dna[a_pos-1] == 'a' and dna[a_pos] == 'g', pdb.set_trace()
+ #for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
+ # assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
+ # or dna[d_pos+1] == 't'), pdb.set_trace()
+ #
+ #for a_pos in [pos for pos,elem in enumerate(acceptor) if elem != -inf]:
+ # assert dna[a_pos-1] == 'a' and dna[a_pos] == 'g', pdb.set_trace()
ps = h.convert2SWIG()
def evaluate(self,param_filename):
beg = Conf.prediction_begin
end = Conf.prediction_end
+ self.logfh = open('_qpalma_predict.log','w+')
# predict on training set
print '##### Prediction on the training set #####'
print '##### Prediction on the test set #####'
self.predict(param_filename,beg,end)
+ self.logfh.close()
#pdb.set_trace()
def predict(self,param_filename,beg,end):
- self.logfh = open('_qpalma_predict.log','w+')
Sequences = self.Sequences[beg:end]
Exons = self.Exons[beg:end]
don_supp = Donors[exampleIdx]
acc_supp = Acceptors[exampleIdx]
+ #don_supp = [0.0]*len(don_supp)
+ #acc_supp = [0.0]*len(acc_supp)
+
# for idx,elem in enumerate(don_supp):
# if elem != -inf:
# don_supp[idx] = 0.0
# if elem != -inf:
# acc_supp[idx] = 0.0
+ for idx,elem in enumerate(don_supp):
+ if elem != -inf:
+ don_supp[idx] = 0.0
+
+ for idx,elem in enumerate(acc_supp):
+ if elem != -inf:
+ acc_supp[idx] = 0.0
+
## Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
#if Conf.mode == 'using_quality_scores':
# trueSpliceAlign, trueWeightMatch, trueWeightQuality =\
print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
print 'Prediction completed'
- self.logfh.close()
def evaluatePositions(self,eBegin,eEnd):
return line1,line2,line3
-def calc_info(acc,don,exons):
+def calc_info(acc,don,exons,qualities):
min_score = 100
max_score = -100
mean_intron_len /= 1.0*len(exons)
print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
+ min_score = 10000
+ max_score = 0
+ mean_quality = 0
+
+ for qVec in qualities:
+ for elem in qVec:
+ min_score = min(min_score,elem)
+ max_score = max(max_score,elem)
+
+ print 'Quality values max/min: %d / %d mean' % (max_score,min_score)
+
+
def zero_out(vec,intervals):
for interval in intervals:
for pos in interval:
mode = sys.argv[1]
- if len(sys.argv) == 3 and mode == 'train':
- qpalma.train(int(sys.argv[2]))
+ if len(sys.argv) == 2 and mode == 'train':
+ qpalma.train()
elif len(sys.argv) == 3 and mode == 'predict':
param_filename = sys.argv[2]