+ added two scripts for the psl2gff step
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 16 Jun 2008 14:48:38 +0000 (14:48 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 16 Jun 2008 14:48:38 +0000 (14:48 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9622 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/est2gff.sh [new file with mode: 0755]
scripts/est2gff_pipeline.sh [new file with mode: 0755]
scripts/grid_predict.py

diff --git a/scripts/est2gff.sh b/scripts/est2gff.sh
new file mode 100755 (executable)
index 0000000..0a3270d
--- /dev/null
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+g_config=/fml/ag-raetsch/share/databases/genomes/A_thaliana/arabidopsis_tair7/genebuild/genome.config
+
+# First we count the numbers of incorrect gt and gc positions for the full
+# alignment set not filtered by coverage numbers etc.
+
+alignment_file=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/full.align.consistent.unique
+
+touch gt_dpscores.hist && rm gt_dpscores.hist
+touch gc_dpscores.hist && rm gc_dpscores.hist
+
+for CHR in "CHR1" "CHR2" "CHR3" "CHR4" "CHR5"
+do
+   /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR + psl2gff.result psl2gff_2.result $g_config \
+   | grep -A1 '###' | grep -B1 'gt..\.' | grep check_psl >> gt_dpscores.hist
+   #| grep -A1 '###' | grep -B1 'gt..\.' | grep check_psl  |cut -f2 -d '(' | cut -f1 -d ')' >> gt_dpscores.hist
+
+   /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR + psl2gff.result psl2gff_2.result $g_config \
+   | grep -A1 '###' | grep -B1 'gc..\.' | grep check_psl >> gc_dpscores.hist
+   #| grep -A1 '###' | grep -B1 'gc..\.' | grep check_psl  |cut -f2 -d '(' | cut -f1 -d ')' >> gc_dpscores.hist
+done
+
+# Now we determine the numbers of incorrect gt and gc positions for the full
+# alignment set FILTERED by coverage numbers.
+
+alignment_file=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/full.align.coverage_filtered
+
+touch gt_dpscores.hist.coverage_filtered && rm gt_dpscores.hist.coverage_filtered 
+touch gc_dpscores.hist.coverage_filtered && rm gc_dpscores.hist.coverage_filtered 
+
+for CHR in "CHR1" "CHR2" "CHR3" "CHR4" "CHR5"
+do
+   /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR + psl2gff.result psl2gff_2.result $g_config \
+   | grep -A1 '###' | grep -B1 'gt..\.' | grep check_psl >> gt_dpscores.hist.coverage_filtered 
+   #| grep -A1 '###' | grep -B1 'gt..\.' | grep check_psl  |cut -f2 -d '(' | cut -f1 -d ')' >> gt_dpscores.hist.coverage_filtered 
+
+   /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR + psl2gff.result psl2gff_2.result $g_config \
+   | grep -A1 '###' | grep -B1 'gc..\.' | grep check_psl >> gc_dpscores.hist.coverage_filtered 
+   #| grep -A1 '###' | grep -B1 'gc..\.' | grep check_psl  |cut -f2 -d '(' | cut -f1 -d ')' >> gc_dpscores.hist.coverage_filtered 
+done
+
+cat gt_dpscores.hist | sort > gt_dpscores.hist.sorted
+cat gt_dpscores.hist.coverage_filtered | sort > gt_dpscores.hist.coverage_filtered.sorted
+diff  --suppress-common-lines gt_dpscores.hist.coverage_filtered.sorted gt_dpscores.hist.sorted > DIFF
diff --git a/scripts/est2gff_pipeline.sh b/scripts/est2gff_pipeline.sh
new file mode 100755 (executable)
index 0000000..ec98957
--- /dev/null
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+#
+# This script takes the alignment output of QPalma and uses the script psl2gff
+# to cluster the reads together in order to infer gene structures.
+# 
+# The result is a (are several) gff file(s) that are used as input to the
+# compute_splicegraph script of the genebuild project.
+#
+
+g_config=/fml/ag-raetsch/share/databases/genomes/A_thaliana/arabidopsis_tair7/genebuild/genome.config
+
+# First we count the numbers of incorrect gt and gc positions for the full
+# alignment set not filtered by coverage numbers etc.
+
+alignment_file=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/full.align.consistent.unique
+
+for CHR in "CHR1" "CHR2" "CHR3" "CHR4" "CHR5"
+do
+   for STRAND in "+" "-"
+   do
+      echo $CHR $STRAND
+      result1_fn=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/SpliceGraphResults/gff/myexons_${CHR}${STRAND}.gff
+      result2_fn=/dev/null
+   
+      /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR ${STRAND} $result1_fn $result2_fn $g_config 1>>LOG
+   done
+done
index 2afeb7c..3d962f7 100644 (file)
@@ -45,7 +45,7 @@ def makeJobs(run,dataset_fn,chunks,param):
 
    for c_name,current_chunk in chunks:
       current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param,c_name])
-      current_job.h_vmem = '12.0G'
+      current_job.h_vmem = '30.0G'
       #current_job.express = 'True'
 
       print "job #1: ", current_job.nativeSpecification
@@ -62,9 +62,11 @@ def create_and_submit():
 
    jp = os.path.join
 
-   run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
+   run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
 
    run   = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
+   run['name'] = 'saved_run'
+
    param = cPickle.load(open(jp(run_dir,'param_526.pickle')))
 
    dataset_fn           = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.pickle'