From: fabio Date: Mon, 4 Feb 2008 10:11:34 +0000 (+0000) Subject: + modified filtering of reads X-Git-Url: http://git.tuebingen.mpg.de/?p=qpalma.git;a=commitdiff_plain;h=e357e548f037bd36b908f96814baeeabd2a83b8a + modified filtering of reads + extended dataset generation script + rearranged loading functions for qpalma training git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7678 e1793c9e-67f9-0310-80fc-b846ff1f7b36 --- diff --git a/qpalma/Configuration.py b/qpalma/Configuration.py index e613511..a4c2876 100644 --- a/qpalma/Configuration.py +++ b/qpalma/Configuration.py @@ -436,6 +436,9 @@ annot_path = joinp(data_path,'annotation_data') original_path = joinp(data_path,'original_solexa_data') joinp(annot_path,'TAIR7_GFF3_genes_Chr1.gff_v1') + + +data_filename = '' ############################################################################### # # SANITY CHECKS diff --git a/qpalma/DataProc.py b/qpalma/DataProc.py index 5d02c6b..8db88d3 100644 --- a/qpalma/DataProc.py +++ b/qpalma/DataProc.py @@ -13,6 +13,20 @@ import sys from genome_utils import load_genomic +def paths_load_data(filename,expt,genome_info,PAR,test=False): + + data = io_pickle.load(filename) + + Sequences = data['Sequences'] + Acceptors = data['Acceptors'] + Donors = data['Donors'] + Exons = data['Exons'] + Ests = data['Ests'] + Qualities = data['Qualities'] + SplitPositions = data['SplitPositions'] + + return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions + def paths_load_data_solexa(expt,genome_info,PAR,test=False): """ @@ -254,63 +268,3 @@ def paths_load_data_pickle(expt,genome_info,PAR): Exons -= 1 return Sequences, Acceptors, Donors, Exons, Ests, Noises - -def paths_load_data(expt,genome_info,PAR): - """ - - """ - - # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR) - # Load the relevant file and return the alignment data - - # expt can be 'training','validation' or 'test' - - assert expt in ['training','validation','test'] - - tmp_dir = '/fml/ag-raetsch/home/fabio/tmp' - - Noises = []; - - if expt == 'training': - if PAR.microexon: - if PAR.LOCAL_ALIGN: # local version - - train_data = '%s/microexon_train_data_cut_local.mat' % genome_info.basedir - train_data = '%s/microexon_train_data.mat' % genome_info.basedir - #train_data_pickle = '%s/microexon_train_data_cut_local.pickle'% tmp_dir - #io_pickle.convert_v6(train_data,train_data_pickle) - #train_data = io_pickle.load(train_data_pickle) - data = scipy.io.loadmat(train_data) - - else: # global version - - train_data = '%s/microexon_train_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat' %\ - (genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob) - - train_data = '%s/microexon_train_data.mat' % genome_info.basedir - #train_data_pickle = '%s/microexon_train_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.pickle' %\ - # (tmp_dir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob) - - #io_pickle.convert_v6(train_data,train_data_pickle) - #train_data = io_pickle.load(train_data_pickle) - data = scipy.io.loadmat(train_data) - Noises = data['TrainNoise'] # substitution matrix - - else: - train_data = '%s/exons_train_local.mat' % genome_info.basedir - #train_data_pickle = '%s/exons_train_local.pickle'% tmp_dir - #io_pickle.convert_v6(train_data,train_data_pickle) - #microexon_train_data = io_pickle.load(train_data_pickle) - data = scipy.io.loadmat(train_data) - - #print 'train_data is %s' % train_data - - Sequences = data['Train'] # dna sequences - Acceptors = data['TrainAcc'] # acceptor scores - Donors = data['TrainDon'] # donor scores - Exons = data['TrainExon'] # exon boundaries - Ests = data['TrainEsts'] # est sequences - - Exons -= 1 - - return Sequences, Acceptors, Donors, Exons, Ests, Noises diff --git a/qpalma/parsers.py b/qpalma/parsers.py index 072debd..81ec5b7 100644 --- a/qpalma/parsers.py +++ b/qpalma/parsers.py @@ -15,6 +15,13 @@ class ReadParser: def next(self): pass + def parse(self): + lines = [] + + for elem in self: + lines.append(elem) + + return lines class FilteredReadParser(ReadParser): """ @@ -27,15 +34,33 @@ class FilteredReadParser(ReadParser): def parseLine(self,line): """ - We assume that a line has the following entires: + We assume that a line has the following entries: + + read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop """ - return id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split() + id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split() + splitpos = int(splitpos) + read_size = int(read_size) + + prb = [ord(elem)-50 for elem in prb] + cal = [ord(elem)-64 for elem in cal] + chastity = [ord(elem)+10 for elem in chastity] + + p_start = int(p_start) + exons_stop = int(exon_stop) + exon_start = int(exon_start) + p_stop = int(p_stop) + + line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\ + 'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\ + 'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start, 'p_stop':p_stop} + return line_d def next(self): for line in self.fh: line = line.strip() - yield self.parseLine + yield self.parseLine(line) raise StopIteration @@ -44,6 +69,11 @@ class RemappedReadParser(ReadParser): """ This class offers a parser for the reads that are remapped by the vmatch utility. + + According to the docu the entries are: + + ID, Chromosome, Position, Orientation (D or P), Mismatches, Alignment length, Offset, Alignment + """ def __init__(self,filename): @@ -54,11 +84,19 @@ class RemappedReadParser(ReadParser): We assume that a line has the following entires: """ - return id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split() + id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split() + pos = int(pos) + mismatches = int(mismatches) + align_len = int(align_len) + offset = int(offset) + line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand, 'mismatches':mismatches, 'align_len':align_len,\ + 'offset':offset, 'alignment':alignment} + return line_d def next(self): for line in self.fh: line = line.strip() - yield self.parseLine + yield self.parseLine(line) raise StopIteration + diff --git a/scripts/compile_dataset.py b/scripts/compile_dataset.py index 88596e8..51c5d9d 100644 --- a/scripts/compile_dataset.py +++ b/scripts/compile_dataset.py @@ -45,7 +45,7 @@ info = """ - 'remappped_pos' - the position of the remapped read """ -def compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,tmp_dir,dataset_file): +def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file): assert os.path.exists(gff_file) for file in dna_flat_files: @@ -64,6 +64,13 @@ def compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,tmp_dir,datase dna_filename = Conf.dna_filename est_filename = Conf.est_filename + frp = FilteredReadParser(filtered_reads) + all_filtered_reads = frp.parse() + + rrp = RemappedReadParser(remapped_reads) + all_remapped_reads = rrp.parse() + + # this stores the new dataset Sequences = [] Acceptors = [] Donors = [] @@ -71,199 +78,182 @@ def compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,tmp_dir,datase Ests = [] Qualities = [] SplitPositions = [] + + # Iterate over all remapped reads in order to generate for each read an + # training / prediction example + for reRead in all_remapped_reads: + currentId = reRead['id'] + fRead = all_filtered_reads[currentId] + currentGene = allGenes[gene_id] - frp = FilteredReadParser('') - rrp = RemappedReadParser('est_filename') + chrom = 'chr1' + genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/' + sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+' + seq, acc, don, exons, est, qual, spos = process_read(reRead,fRead,currentGene,chrom,genome_file,sscore_filename) + + if newExample == None: + continue - # Iterate over all reads - for line in open(est_filename): - line = line.strip() - #chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split() - #chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split() - id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split() + Sequences.append(seq) + Acceptors.append(acc) + Donors.append(don) + Exons.append(exons) + Ests.append(est) + Qualities.append(qual) + SplitPositions.append(spos) - splitpos = int(splitpos) - length = int(length) - prb = [ord(elem)-50 for elem in prb] - cal = [ord(elem)-64 for elem in cal] - chastity = [ord(elem)+10 for elem in chastity] - assert len(prb) == len(seq) + dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors\ + 'Exons':Exons, 'Ests':Ests, 'Qualities':Qualities, 'SplitPositions':SplitPositions} + # saving new dataset + io_pickle.save(dataset_file,dataset) - currentGene = allGenes[gene_id] - seq = seq.lower() - #try: - # currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower() - #except: - # continue +def process_read(reRead,fRead,currentGene,chrom,genome_file,sscore_filename): + """ + The purpose of this function is to create an example for QPalma + by using a - genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/' - chrom = 'chr1' + """ - # use matlab-style functions to access dna sequence - dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True) - currentSeq = dna + # use matlab-style functions to access dna sequence + dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True) + dna = dna.lower() + currentSeq = dna - if len(currentSeq) < (currentGene.stop - currentGene.start): - continue + if len(currentSeq) < (currentGene.stop - currentGene.start): + return None - # compare both sequences - #assert dna == currentSeq, pdb.set_trace() - - p_start = int(p_start) - exon_stop = int(exon_stop) - exon_start = int(exon_start) - p_stop = int(p_stop) - - assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices' - assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size' - assert p_stop - p_start >= 36 - - currentExons = zeros((2,2)) + p_start = fRead['p_start'] + exon_stop = fRead['exon_stop'] + exon_start = fRead['exon_start'] + p_stop = fRead['p_stop'] - currentExons[0,0] = p_start - currentExons[0,1] = exon_stop + currentSeq = fRead['seq'] - currentExons[1,0] = exon_start - currentExons[1,1] = p_stop + assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices' + assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size' + assert p_stop - p_start >= 36 - # make indices relative to start pos of the current gene - currentExons -= currentGene.start + currentExons = zeros((2,2)) - # determine cutting positions for sequences - up_cut = int(currentExons[0,0]) - down_cut = int(currentExons[1,1]) + currentExons[0,0] = p_start + currentExons[0,1] = exon_stop - # if we perform testing we cut a much wider region as we want to detect - # how good we perform on a region - if test: - up_cut = up_cut-500 - if up_cut < 0: - up_cut = 0 + currentExons[1,0] = exon_start + currentExons[1,1] = p_stop - down_cut = down_cut+500 + # make indices relative to start pos of the current gene + currentExons -= currentGene.start - if down_cut > len(currentSeq): - down_cut = len(currentSeq) + # determine cutting positions for sequences + up_cut = int(currentExons[0,0]) + down_cut = int(currentExons[1,1]) - # cut out part of the sequence - currentSeq = currentSeq[up_cut:down_cut] + # if we perform testing we cut a much wider region as we want to detect + # how good we perform on a region + if test: + up_cut = up_cut-500 + if up_cut < 0: + up_cut = 0 - # ensure that index is correct - currentExons[0,1] += 1 + down_cut = down_cut+500 - if test: - currentExons -= up_cut - else: - currentExons -= currentExons[0,0] + if down_cut > len(currentSeq): + down_cut = len(currentSeq) - try: - if not (currentSeq[int(currentExons[0,1])] == 'g' and\ - currentSeq[int(currentExons[0,1])+1] in ['c','t' ]): - continue + # cut out part of the sequence + currentSeq = currentSeq[up_cut:down_cut] - if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'): - continue + # ensure that index is correct + currentExons[0,1] += 1 - except: - pdb.set_trace() + if test: + currentExons -= up_cut + else: + currentExons -= currentExons[0,0] - Sequences.append(currentSeq) + try: + if not (currentSeq[int(currentExons[0,1])] == 'g' and\ + currentSeq[int(currentExons[0,1])+1] in ['c','t' ]): + continue - # now we want to use interval_query to get the predicted splice scores - # trained on the TAIR sequence and annotation + if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'): + continue - from Genefinding import * - - interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut]) - num_fields = 1 - num_intervals = 1 - - # fetch acceptor scores - fname = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+' - gf = Genef() - num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals) - assert num_hits <= len(currentSeq) - - #print 'Acceptor hits: %d' % num_hits + except: + pdb.set_trace() - pos = createIntArrayFromList([0]*num_hits) - indices = createIntArrayFromList([0]*num_hits) - scores = createDoubleArrayFromList([0]*num_hits*num_fields) - gf.getResults(pos,indices,scores) + # now we want to use interval_query to get the predicted splice scores + # trained on the TAIR sequence and annotation + from Genefinding import * + + interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut]) + num_fields = 1 + num_intervals = 1 + + # fetch acceptor scores + gf = Genef() + num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals) + assert num_hits <= len(currentSeq) + + #print 'Acceptor hits: %d' % num_hits - currentAcceptors = [-inf]*(len(currentSeq)+1) - - for i in range(num_hits): + pos = createIntArrayFromList([0]*num_hits) + indices = createIntArrayFromList([0]*num_hits) + scores = createDoubleArrayFromList([0]*num_hits*num_fields) + gf.getResults(pos,indices,scores) + + acc = [-inf]*(len(currentSeq)+1) + + for i in range(num_hits): + position = pos[i] + position -= (currentGene.start+up_cut) + assert 0 <= position < len(currentSeq)+1, 'position index wrong' + acc[position] = scores[i] + + acc = acc[1:] + [-inf] + Acceptors.append(acc) + + del gf + + # fetch donor scores + gf = Genef() + num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals) + assert num_hits <= len(currentSeq) + + #print 'Donor hits: %d' % num_hits + pos = createIntArrayFromList([0]*num_hits) + indices = createIntArrayFromList([0]*num_hits) + scores = createDoubleArrayFromList([0]*num_hits*num_fields) + gf.getResults(pos,indices,scores) + + don = [-inf]*(len(currentSeq)) + + try: + for i in range(1,num_hits): position = pos[i] position -= (currentGene.start+up_cut) - assert 0 <= position < len(currentSeq)+1, 'position index wrong' - currentAcceptors[position] = scores[i] - - currentAcceptors = currentAcceptors[1:] + [-inf] - Acceptors.append(currentAcceptors) - - del gf - - # fetch donor scores - fname = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+' - gf = Genef() - num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals) - assert num_hits <= len(currentSeq) - - #print 'Donor hits: %d' % num_hits - pos = createIntArrayFromList([0]*num_hits) - indices = createIntArrayFromList([0]*num_hits) - scores = createDoubleArrayFromList([0]*num_hits*num_fields) - gf.getResults(pos,indices,scores) - - currentDonors = [-inf]*(len(currentSeq)) - - try: - for i in range(1,num_hits): - position = pos[i] - position -= (currentGene.start+up_cut) - currentDonors[position-1] = scores[i] - except: - pdb.set_trace() - - currentDonors = [-inf] + currentDonors[:-1] - Donors.append(currentDonors) - - Exons.append(currentExons) - Ests.append(seq) - Qualities.append(prb) - - SplitPositions.append(splitpos) - - #if len(Sequences[-1]) == 2755: - # Sequences = Sequences[:-1] - # Acceptors = Acceptors[:-1] - # Donors = Donors[:-1] - # Exons = Exons[:-1] - # Ests = Ests[:-1] - # Qualities = Qualities[:-1] - # SplitPositions = SplitPositions[:-1] - - #print 'found %d examples' % len(Sequences) + don[position-1] = scores[i] + except: + pdb.set_trace() - return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions + don = [-inf] + currentDonors[:-1] + return seq, acc, don, currentExons, est, qual, spos - - dataset = [] +def reverse_complement(seq): + map {'a':'t','c':'g','g':'c','t':'a'} - {'id':'', 'chr':'', '', 'strand':, = line.split() + new_seq = [map[elem] for elem in seq] + new_seq.reverse() + return new_seq - io_pickle.save(dataset_file,dataset) - if __name__ == '__main__': - if len(sys.argv) == 1: print info diff --git a/scripts/qpalma_main.py b/scripts/qpalma_main.py index ed94ab1..f5438df 100644 --- a/scripts/qpalma_main.py +++ b/scripts/qpalma_main.py @@ -65,8 +65,10 @@ class QPalma: elif Conf.mode == 'using_quality_scores': - filename = 'real_dset_%s'% 'recent' - Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS) + #Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS) + data_filename = Conf.data_filename + + Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS) #end = 1000 self.Sequences = Sequences @@ -74,8 +76,8 @@ class QPalma: self.Ests = Ests self.Qualities = Qualities self.SplitPos = SplitPos - - self.Donors, self.Acceptors = getDonAccScores(Sequences) + self.Donors = Donors + self.Acceptors = Acceptors #self.Acceptors = self.Acceptors[:end] #self.Donors = self.Donors[:end] diff --git a/tools/data_tools/filterReads.c b/tools/data_tools/filterReads.c index 55730cf..ece462a 100644 --- a/tools/data_tools/filterReads.c +++ b/tools/data_tools/filterReads.c @@ -46,6 +46,7 @@ const int read_size = 36; const int min_overlap = 6; const int max_overlap = 30; +int read_nr = 1; int combined_reads = 0; int main(int argc, char* argv[]) { @@ -460,8 +461,9 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v // exons_stop : the position in the dna where the first exons ends // exons_start : the position in the dna where the second exons starts // p_stop : the position in the dna where the (truncated) second read ends - fprintf(out_fs,"%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", - new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop); + fprintf(out_fs,"%08d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", + read_nr,new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop); + read_nr++; combined_reads++; used_flag[down_idx] = 1;