From 2b141017e4d2f53a4106b46219f1ff60c8986aa8 Mon Sep 17 00:00:00 2001 From: fabio Date: Wed, 18 Jun 2008 12:59:28 +0000 Subject: [PATCH] + added alignment reconstruction function + add script for est2gff processing git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9653 e1793c9e-67f9-0310-80fc-b846ff1f7b36 --- scripts/Utils.py | 3 +- scripts/createAlignmentFileFromPrediction.py | 96 +++++++++++++++++--- scripts/est2gff_pipeline.sh | 2 +- scripts/grid_alignment.py | 18 +++- scripts/qpalma_main.py | 4 +- 5 files changed, 104 insertions(+), 19 deletions(-) diff --git a/scripts/Utils.py b/scripts/Utils.py index 74d83a4..1c3c4f7 100644 --- a/scripts/Utils.py +++ b/scripts/Utils.py @@ -131,7 +131,8 @@ def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array): dna_array[idx] = translation[int(dna_array[idx])] est_array[idx] = translation[int(est_array[idx])] except: - pdb.set_trace() + #pdb.set_trace() + pass line1 = "".join(dna_array) line2 = comparison diff --git a/scripts/createAlignmentFileFromPrediction.py b/scripts/createAlignmentFileFromPrediction.py index 54d3beb..922dc70 100644 --- a/scripts/createAlignmentFileFromPrediction.py +++ b/scripts/createAlignmentFileFromPrediction.py @@ -11,6 +11,68 @@ import numpy import qparser +from Utils import pprint_alignment + +import palma.palma_utils as pu +from palma.output_formating import print_results + + +def alignment_reconstruct(current_prediction,num_exons): + """ + We reconstruct the exact alignment to infer the positions of indels and the + sizes of the respective exons. + """ + + SpliceAlign = current_prediction['spliceAlign'] + estAlign = current_prediction['estAlign'] + + dna_array = current_prediction['dna_array'] + est_array = current_prediction['est_array'] + + dna_array = "".join(map(lambda x: str(x),dna_array)) + est_array = "".join(map(lambda x: str(x),est_array)) + + translation = '-ACGTN_' #how aligned est and dna sequence is displayed + #(gap_char, 5 nucleotides, intron_char) + + # this array hold a number for each exons indicating its number of matches + exonIdentities = [0]*num_exons + exonGaps = [0]*num_exons + + est_array = map(lambda x: translation[int(x)],est_array) + dna_array = map(lambda x: translation[int(x)],dna_array) + + gaps = 0 + identity = 0 + idx = 0 + + #if num_exons > 1: + # pdb.set_trace() + + for ridx in range(len(est_array)): + read_elem = est_array[ridx] + dna_elem = dna_array[ridx] + + if ridx > 0 and est_array[ridx-1] != '_' and est_array[ridx] == '_': + idx += 1 + gaps = 0 + identity = 0 + + if read_elem == '_': + continue + + if read_elem == dna_elem: + identity += 1 + + if read_elem == '-': + gaps += 1 + + exonIdentities[idx] = identity + exonGaps[idx] = gaps + + return exonIdentities,exonGaps + + def create_alignment_file(current_loc,out_fname): import numpy #qparser.parse_reads(filtered_reads) @@ -18,6 +80,7 @@ def create_alignment_file(current_loc,out_fname): out_fh = open(out_fname,'w+') allPredictions = cPickle.load(open(current_loc)) + #allPredictions = current_loc allPositions = {} @@ -45,12 +108,12 @@ def create_alignment_file(current_loc,out_fname): predExons = current_prediction['predExons'] predExons = [e+start_pos for e in predExons] - (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\ tExonSizes,tStarts, tEnds) = alignment - EST2GFF = False + new_line = '' + if EST2GFF: predExons = numpy.mat(predExons).reshape(len(predExons)/2,2) @@ -74,7 +137,6 @@ def create_alignment_file(current_loc,out_fname): # &strand, Qname, &Qsize, # &Qstart, &Qend, Tname, &Tsize, # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str - # # ATTENTION # @@ -100,25 +162,31 @@ def create_alignment_file(current_loc,out_fname): Qstarts_str = str(qStarts)[1:-1].replace(' ','') Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','') - try: - new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\ + new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\ + % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\ + Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\ + Tname, Tsize, Tstart, Tend,\ + blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons)) + + else: + + num_exons = len(qExonSizes) + exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons) + + pp = lambda x : str(x)[1:-1].replace(' ','') + + new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\ (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\ str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\ str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\ - str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ','')) - except: - pdb.set_trace() - - #new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\ - #% (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\ - #Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\ - #Tname, Tsize, Tstart, Tend,\ - #blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons)) + str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''),\ + str(exonIdentities)[1:-1].replace(' ',''),str(exonGaps)[1:-1].replace(' ','')) out_fh.write(new_line) return allPositions + if __name__ == '__main__': current_dir = sys.argv[1] out_fh = sys.argv[2] diff --git a/scripts/est2gff_pipeline.sh b/scripts/est2gff_pipeline.sh index ec98957..926df1d 100755 --- a/scripts/est2gff_pipeline.sh +++ b/scripts/est2gff_pipeline.sh @@ -20,7 +20,7 @@ do for STRAND in "+" "-" do echo $CHR $STRAND - result1_fn=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/SpliceGraphResults/gff/myexons_${CHR}${STRAND}.gff + result1_fn=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/SpliceGraphResults/gff/test_myexons_${CHR}${STRAND}.gff result2_fn=/dev/null /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR ${STRAND} $result1_fn $result2_fn $g_config 1>>LOG diff --git a/scripts/grid_alignment.py b/scripts/grid_alignment.py index c2b8cff..0959c5d 100644 --- a/scripts/grid_alignment.py +++ b/scripts/grid_alignment.py @@ -24,7 +24,9 @@ def g_alignment(chunk_fn,result_fn): def create_and_submit(): jp = os.path.join - run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+' + #run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+' + run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run' + data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data' chunks_fn = [] @@ -32,6 +34,20 @@ def create_and_submit(): if fn.startswith('chunk'): chunks_fn.append(fn) + + chunks_fn = [\ + 'chunk_17.predictions.pickle',\ + 'chunk_18.predictions.pickle',\ + 'chunk_19.predictions.pickle',\ + 'chunk_20.predictions.pickle',\ + 'chunk_21.predictions.pickle',\ + 'chunk_23.predictions.pickle',\ + 'chunk_24.predictions.pickle' + ] + + print chunks_fn + + functionJobs=[] for chunk_fn in chunks_fn: diff --git a/scripts/qpalma_main.py b/scripts/qpalma_main.py index cf97fb4..d0a17ad 100644 --- a/scripts/qpalma_main.py +++ b/scripts/qpalma_main.py @@ -742,8 +742,8 @@ class QPalma: alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array) #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) - dna_array = array.array('c',dna_array) - est_array = array.array('c',est_array) + dna_array = array.array('B',dna_array) + est_array = array.array('B',est_array) #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array) #self.plog(line1+'\n') -- 2.20.1