From a14a392a972206f8953c7262e23e2fdb2deaa69e Mon Sep 17 00:00:00 2001 From: fabio Date: Fri, 25 Jan 2008 14:33:07 +0000 Subject: [PATCH] + added exons read boundary output to filterReads output + minor modifications of data processing tools to allow for train/predict mode git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7580 e1793c9e-67f9-0310-80fc-b846ff1f7b36 --- qpalma/Configuration.py | 2 +- qpalma/DataProc.py | 93 ++++++++++++++++++---------------- qpalma/computeSpliceWeights.py | 2 +- qpalma/tools/splicesites.py | 4 +- scripts/qpalma_predict.py | 61 +++++++++++++--------- scripts/qpalma_train.py | 54 ++++++++++++++------ tools/data_tools/filterReads.c | 36 +++++++++---- 7 files changed, 152 insertions(+), 100 deletions(-) diff --git a/qpalma/Configuration.py b/qpalma/Configuration.py index 4334ef9..36db5df 100644 --- a/qpalma/Configuration.py +++ b/qpalma/Configuration.py @@ -218,7 +218,7 @@ mode = 'using_quality_scores' numDonSuppPoints = 30 numAccSuppPoints = 30 numLengthSuppPoints = 30 -numQualSuppPoints = 16 +numQualSuppPoints = 4 min_qual = -1 max_qual = 40 diff --git a/qpalma/DataProc.py b/qpalma/DataProc.py index 18efe36..33b2995 100644 --- a/qpalma/DataProc.py +++ b/qpalma/DataProc.py @@ -10,8 +10,7 @@ import io_pickle import scipy.io import pdb - -def paths_load_data_solexa(expt,genome_info,PAR): +def paths_load_data_solexa(expt,genome_info,PAR,test=False): # expt can be 'training','validation' or 'test' assert expt in ['training','validation','test'] @@ -32,7 +31,7 @@ def paths_load_data_solexa(expt,genome_info,PAR): for line in open(est_filename): line = line.strip() - chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,exon_idx = line.split() + chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split() splitpos = int(splitpos) length = int(length) prb = [ord(elem)-50 for elem in prb] @@ -48,59 +47,63 @@ def paths_load_data_solexa(expt,genome_info,PAR): currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower() except: continue + + if len(currentSeq) < (currentGene.stop - currentGene.start): + continue - #assert currentSize == len(Sequences[-1]), 'gene_size %d / seq size %d' % (currentSize,len(Sequences[-1])) - #Acceptors.append([-inf]*currentSize) - #Donors.append([-inf]*currentSize) + oldSeq = currentSeq - exon_idx = int(exon_idx) + p_start = int(p_start) + exon_stop = int(exon_stop) + exon_start = int(exon_start) + p_stop = int(p_stop) + + assert p_start <= exon_stop <= exon_start <= p_stop, 'Invalid Indices' + assert p_stop - p_start >= 36 - assert exon_idx > 0 - #print "%d " % exon_idx, currentExons = zeros((2,2)) - #for idx in range(len(currentGene.exons)): - # currentExons[idx,0] = currentGene.exons[idx][0]-currentGene.start - # currentExons[idx,1] = currentGene.exons[idx][1]-currentGene.start - try: - currentExons[0,0] = currentGene.exons[exon_idx-1][0]-currentGene.start - currentExons[0,1] = currentGene.exons[exon_idx-1][1]-currentGene.start + currentExons[0,0] = p_start-currentGene.start + currentExons[0,1] = exon_stop-currentGene.start - currentExons[1,0] = currentGene.exons[exon_idx][0]-currentGene.start - currentExons[1,1] = currentGene.exons[exon_idx][1]-currentGene.start + currentExons[1,0] = exon_start-currentGene.start + currentExons[1,1] = p_stop-currentGene.start - cut_offset = 500 + import copy + oldExons = copy.deepcopy(currentExons) + + up_cut = int(currentExons[0,0]) + down_cut = int(currentExons[1,1]) - up_cut = int(currentExons[0,0]) - cut_offset - down_cut = int(currentExons[1,1]) + cut_offset + if test: + up_cut = up_cut-500 if up_cut < 0: up_cut = 0 + down_cut = down_cut+500 + if down_cut > len(currentSeq): down_cut = len(currentSeq) - assert down_cut - up_cut > 0 - assert down_cut - up_cut <= len(currentSeq) + currentSeq = currentSeq[up_cut:down_cut] - currentSeq = currentSeq[up_cut:down_cut] - - #pdb.set_trace() + currentExons[0,1] += 1 + #currentExons[1,0] += 1 - if up_cut > 0: - currentExons[0,0] -= up_cut - currentExons[0,1] -= up_cut + if test: + currentExons -= up_cut + else: + currentExons -= currentExons[0,0] - currentExons[1,0] -= up_cut - currentExons[1,1] -= up_cut - - except: - continue - #pdb.set_trace() + try: + if not (currentSeq[int(currentExons[0,1])] == 'g' and currentSeq[int(currentExons[0,1])+1] == 't'): + continue - #pdb.set_trace() + if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'): + continue - currentExons[0,1] += 1 - #currentExons[1,0] += 1 + except: + pdb.set_trace() Sequences.append(currentSeq) @@ -114,14 +117,14 @@ def paths_load_data_solexa(expt,genome_info,PAR): SplitPositions.append(splitpos) - if len(Sequences[-1]) == 2755: - Sequences = Sequences[:-1] - Acceptors = Acceptors[:-1] - Donors = Donors[:-1] - Exons = Exons[:-1] - Ests = Ests[:-1] - Qualities = Qualities[:-1] - SplitPositions = SplitPositions[:-1] + #if len(Sequences[-1]) == 2755: + # Sequences = Sequences[:-1] + # Acceptors = Acceptors[:-1] + # Donors = Donors[:-1] + # Exons = Exons[:-1] + # Ests = Ests[:-1] + # Qualities = Qualities[:-1] + # SplitPositions = SplitPositions[:-1] print 'found %d examples' % len(Sequences) diff --git a/qpalma/computeSpliceWeights.py b/qpalma/computeSpliceWeights.py index 1bfb21e..14b300c 100644 --- a/qpalma/computeSpliceWeights.py +++ b/qpalma/computeSpliceWeights.py @@ -46,7 +46,7 @@ def calculateWeights(plf, scores): return currentWeight -def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp,dec=False): +def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp): #################################################################################### # 1. Donor: In don_supp stehen Werte der SVM., in SpliceAlign die Einsen #################################################################################### diff --git a/qpalma/tools/splicesites.py b/qpalma/tools/splicesites.py index ac64a9d..e20e9dd 100644 --- a/qpalma/tools/splicesites.py +++ b/qpalma/tools/splicesites.py @@ -380,8 +380,8 @@ def getDonAccScores(Sequences): acc_score = [acc_score[0]] + acc_score acc_score = acc_score[:-1] Acceptors.append(acc_score) - print str((max(don_score),min(numpy.where(numpy.isinf(don_score),100,don_score)))) - print str((max(acc_score),min(numpy.where(numpy.isinf(acc_score),100,acc_score)))) + #print str((max(don_score),min(numpy.where(numpy.isinf(don_score),100,don_score)))) + #print str((max(acc_score),min(numpy.where(numpy.isinf(acc_score),100,acc_score)))) #pdb.set_trace() diff --git a/scripts/qpalma_predict.py b/scripts/qpalma_predict.py index 241410d..b5cbaa2 100644 --- a/scripts/qpalma_predict.py +++ b/scripts/qpalma_predict.py @@ -35,6 +35,8 @@ import qpalma.Configuration from qpalma.Plif import Plf from qpalma.Helpers import * +from qpalma.tools.splicesites import getDonAccScores + def getQualityFeatureCounts(qualityPlifs): weightQuality = qualityPlifs[0].penalties for currentPlif in qualityPlifs[1:]: @@ -70,18 +72,26 @@ class QPalma: use_quality_scores = False elif Conf.mode == 'using_quality_scores': - #Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS) - #end = 50 - #Sequences = Sequences[:end] - #Acceptors = Acceptors[:end] - #Donors = Donors[:end] - #Exons = Exons[:end] - #Ests = Ests[:end] - #Qualities = Qualities[:end] - #SplitPos = SplitPos[:end] + Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos =\ + paths_load_data_solexa('training',None,self.ARGS,True) + + begin = 200 + end = 400 + Sequences = Sequences[begin:end] + Exons = Exons[begin:end] + Ests = Ests[begin:end] + Qualities = Qualities[begin:end] + Acceptors = Acceptors[begin:end] + Donors = Donors[begin:end] + + pdb.set_trace() + + Donors, Acceptors = getDonAccScores(Sequences) + + + #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000) + #SplitPos = [1]*len(Qualities) - Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000) - SplitPos = [1]*len(Qualities) #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS) #pdb.set_trace() #Qualities = [] @@ -105,7 +115,7 @@ class QPalma: # Initialize parameter vector / param = numpy.matlib.rand(126,1) #param_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/python/elegans.param' - param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_30.pickle' + param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_200.pickle' param = load_param(param_filename) # Set the parameters such as limits penalties for the Plifs @@ -401,26 +411,27 @@ def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos): else: oldElem = SpliceAlign[pos-1] - if oldElem == 2 and elem == 0: # start of exon + if oldElem != 0 and elem == 0: # start of exon newExons.append(pos-1) - if oldElem == 0 and elem == 1: # end of exon + if oldElem == 0 and elem != 0: # end of exon newExons.append(pos) - pdb.set_trace() - #up_off = -1 - #down_off = -1 + up_off = -1 + down_off = -1 - #if len(newExons) != 4: - # acc = 0.0 - #else: - # e1_begin,e1_end = newExons[0],newExons[1] - # e2_begin,e2_end = newExons[2],newExons[3] + if len(newExons) != 4: + acc = 0.0 + else: + e1_begin,e1_end = newExons[0],newExons[1] + e2_begin,e2_end = newExons[2],newExons[3] - # up_off = int(math.fabs(e1_end - exons[0,1])) - # down_off = int(math.fabs(e2_begin - exons[1,0])) + up_off = int(math.fabs(e1_end - exons[0,1])) + down_off = int(math.fabs(e2_begin - exons[1,0])) + + pdb.set_trace() - #return up_off,down_off + return up_off,down_off if __name__ == '__main__': qpalma = QPalma() diff --git a/scripts/qpalma_train.py b/scripts/qpalma_train.py index 244eb7f..1eac5c3 100644 --- a/scripts/qpalma_train.py +++ b/scripts/qpalma_train.py @@ -74,19 +74,41 @@ class QPalma: use_quality_scores = False elif Configuration.mode == 'using_quality_scores': - #Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS) - #end = 50 - #Sequences = Sequences[:end] - #Exons = Exons[:end] - #Ests = Ests[:end] - #Qualities = Qualities[:end] - #SplitPos = SplitPos[:end] - - #Donors, Acceptors = getDonAccScores(Sequences) - - Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000) + filename = 'real_dset_%s'% 'recent' + if True:# not os.path.exists(filename): + Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS) + + end = 200 + Sequences = Sequences[:end] + Exons = Exons[:end] + Ests = Ests[:end] + Qualities = Qualities[:end] + Acceptors = Acceptors[:end] + Donors = Donors[:end] + + pdb.set_trace() + + Donors, Acceptors = getDonAccScores(Sequences) + #dset = Dataset() + #dset.Sequences = Sequences + #dset.Acceptor = Acceptors + #dset.Donors = Donors + #dset.Exons = Exons + #dset.Ests = Ests + #dset.Qualities = Qualities + #cPickle.dump(dset,open(filename,'w+')) + else: + dset = cPickle.load(open(filename)) + Sequences = dset.Sequences + Acceptors = dset.Acceptors + Donors = dset.Donors + Exons = dset.Exons + Ests = dset.Ests + Qualities = dset.Qualities + + #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000) #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS) #Qualities = [] #for i in range(len(Ests)): @@ -175,8 +197,8 @@ class QPalma: don_supp = Donors[exampleIdx] acc_supp = Acceptors[exampleIdx] - if exons[-1,1] > len(dna): - continue + #if exons[-1,1] > len(dna): + # continue #pdb.set_trace() # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...) @@ -324,9 +346,9 @@ class QPalma: for pathNr in range(num_path[exampleIdx]): #print 'decodedWeights' - weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, - h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp, - acc_supp,True) + weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\ + h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\ + acc_supp) decodedQualityFeatures = zeros((Configuration.totalQualSuppPoints,1)) for qidx in range(Configuration.totalQualSuppPoints): diff --git a/tools/data_tools/filterReads.c b/tools/data_tools/filterReads.c index ccefb19..2dc7229 100644 --- a/tools/data_tools/filterReads.c +++ b/tools/data_tools/filterReads.c @@ -352,6 +352,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v int up_idx, down_idx, status; char* upstream_line = malloc(sizeof(char)*256); char* downstream_line = malloc(sizeof(char)*256); + int p_start, p_stop; int buffer_size= 64; @@ -422,7 +423,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v new_chastity[0] = '\0'; int splitpos = 0; - fit = fitting(up_prb+(36-overlap),down_prb); + fit = fitting(up_prb+(36-overlap),down_prb+(exon_start-down_pos)); if (fit == -1) continue; @@ -433,18 +434,32 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v new_strand = up_strand; splitpos = overlap; + p_start = up_pos; + p_stop = exon_start+(read_size-overlap); + strncat(new_seq,new_up_seq,overlap); strncat(new_prb,up_prb,overlap); strncat(new_cal_prb,up_cal_prb,overlap); strncat(new_chastity,up_chastity,overlap); - strncat(new_seq,new_down_seq+fit,read_size-overlap); - strncat(new_prb,down_prb+fit,read_size-overlap); - strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap); - strncat(new_chastity,down_chastity+fit,read_size-overlap); - - fprintf(out_fs,"%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\n", - new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,exon_idx); + //strncat(new_seq,new_down_seq+fit,read_size-overlap); + //strncat(new_prb,down_prb+fit,read_size-overlap); + //strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap); + //strncat(new_chastity,down_chastity+fit,read_size-overlap); + + int offset = exon_start-down_pos; + strncat(new_seq,new_down_seq+offset,read_size-overlap); + strncat(new_prb,down_prb+offset,read_size-overlap); + strncat(new_cal_prb,down_cal_prb+offset,read_size-overlap); + strncat(new_chastity,down_chastity+offset,read_size-overlap); + + // The four last integers specify the positions of + // p_start : the position in the dna where the first read starts + // exons_stop : the position in the dna where the first exons ends + // exons_start : the position in the dna where the second exons starts + // p_stop : the position in the dna where the (truncated) second read ends + fprintf(out_fs,"%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", + new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop); combined_reads++; used_flag[down_idx] = 1; @@ -453,6 +468,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v free(upstream_line); free(downstream_line); + free(used_flag); free(new_up_seq); free(new_down_seq); @@ -496,8 +512,8 @@ int fitting(char* up_prb, char* down_prb) { didx = uidx+w_size; current_mean_up += up_prb[uidx]-50; current_mean_down += up_prb[didx]-50; - } + current_mean_up /= w_size; current_mean_down /= w_size; @@ -505,8 +521,8 @@ int fitting(char* up_prb, char* down_prb) { didx = uidx+w_size; current_variance_up += pow(up_prb[uidx]-50 - current_mean_up,2); current_variance_down += pow(up_prb[didx]-50 - current_mean_up,2); - } + current_variance_up /= (w_size-1); current_variance_down /= (w_size-1); -- 2.20.1