+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-import os.path
-import pdb
-
-"""
-This explanation was take from the SOAP website ( http://soap.genomics.org.cn/ )
-
-One line for One hit. The columns are separated by '\t'.
-
- 1)id: id of read;
-
- 2)sequence: full sequence of read. the read will be converted to the complementary sequence if mapped on the
- reverse chain of reference;
-
- 3)quality: quality of sequence. corresponding to sequence, to be consistent with seq, it will be converted
- too if mapped on reverse chain;
-
- 4)num. of hits: number of equal best hits. the reads with no hits will be ignored;
-
- 5)a/b: flag only meaningful for pair-end alignment, to distinguish which file the read is belonging to;
-
- 6)length: length of the read, if aligned after trimming, it will report the information of trimmed read;
-
- 7)+/-: alignment on the direct(+) or reverse(-) chain of the reference;
-
- 8)chr: id of reference sequence;
-
- 9)location: location of first bp on the reference, counted from 1;
-
- 10)types: type of hits.
-
- "0": exact match
- "1~100 RefAllele->OffsetQueryAlleleQual": number of mismatches, followed by detailed mutation sites and
- switch of allele types. Offset is relative to the initial location on reference. 'OffsetAlleleQual': offset,
- allele, and quality.
-
- Example: "2 A->10T30 C->13A32" means there are two mismatches, one on location+10 of
- reference, and the other on location+13 of reference. The allele on reference is A and C respectively, while
- query allele type and its quality is T,30 and A,32.
-
- "100+n Offset": n-bp insertion on read. Example: "101 15" means 1-bp insertion on read, start after
- location+15 on reference.
-
- "200+n Offset": n-bp deletion on read. Example: "202 16" means 2-bp deletion on query, start after 16bp on reference
-"""
-
-class SOAPParser:
-
- def __init__(self,filename):
- assert os.path.exists(filename)
- self.fh = open(filename)
-
- def parseLine(self,line):
- """
- We assume that a line has entries as described above.
- """
-
- line = line.strip()
- id,seq,q,num_hits,flag,length,align_strand,chrom,loc,type = line.split('\t')[:10]
- sites = line.split('\t')[10:]
- id = int(id)
-
- seq=seq.lower()
-
- line_d = {'id':id, 'seq':seq, 'num_hits':num_hits, 'flag':flag,\
- 'length':length, 'align_strand':align_strand, 'chrom':chrom,\
- 'location':loc, 'type':type, 'sites':sites }
-
- return line_d
-
-
- def parse(self):
- entries = {}
-
- for elem in self.fh:
- line_d = self.parseLine(elem)
- id = line_d['id']
- try:
- entries[id] = [line_d]
- except:
- old_entry = entries[id]
- old_entry.append(line_d)
- entries[id] = old_entry
-
- return entries
-
-
-if __name__ == '__main__':
- soap_p = SOAPParser('/fml/ag-raetsch/home/fabio/projects/soap_1.07/fabio_out_s8_g3_w20.soap')
- soap_p.parse()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+#
+# The purpose of this script is to check the created dataset pickle files for
+# consistency before doing any kind of training/prediction on the data.
+#
+#
+
import sys
import pdb
import cPickle
-from compile_dataset import compile_d,get_seq_and_scores,get_seq
-from qpalma_main import unbracket_est
+from compile_dataset import get_seq_and_scores
+
def checkAll(filename):
dataset = cPickle.load(open(filename))
- [SeqInfo, Exons, OriginalEsts, Qualities, AlternativeSequences] = dataset
- for idx in range(len(SeqInfo)):
+ for idx,(current_key,current_value) in enumerate(dataset.iteritems()):
+ (currentSeqInfo,original_est,currentQualities) = current_value
+      (id,chromo,strand,genomicSeq_start,genomicSeq_stop) = currentSeqInfo
+      (prb,cal_prb,chastity) = currentQualities
+
if idx > 0 and idx % 250 == 0:
print 'processed %d examples' % idx
+   return (True,'')
+
if __name__ == '__main__':
- checkAll(sys.argv[1])
+ dataset_fn = sys.argv[1]
+   status,mes = checkAll(dataset_fn)
+
+ if status == True:
+ print 'Dataset %s seems to be consistent.' % dataset_fn
+ else:
+ print 'Dataset %s seems to be inconsistent!' % dataset_fn
+ print mes
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-import os
-import pdb
-import io_pickle
-
-def evaluate(dataset_fn,prediction_fn):
- dataset = io_pickle.load(dataset_fn)
- prediction = io_pickle.load(prediction_fn)
-
- data = io_pickle.load(filename)
-
- Sequences = data['Sequences']
- Acceptors = data['Acceptors']
- Donors = data['Donors']
- Exons = data['Exons']
- Ests = data['Ests']
- Qualities = data['Qualities']
- SplitPositions = data['SplitPositions']
-
-
-
-if __name__ == '__main__':
- dataset_fn = sys.argv[1]
- prediction_fn = sys.argv[2]
-
- evaluate(dataset_fn,prediction_fn)
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-import pdb
-import subprocess
-import os.path
-
-jp=os.path.join
-
-class Remapping:
- """
- This class wraps the use of SOAP - "Short Oligonucleotide Alignment Program"
- ( http://soap.genomics.org.cn/ )
-
- """
-
- soap_binary = '/fml/ag-raetsch/home/fabio/projects/soap_1.07/soap'
-
- def __init__(self,reads,chromos,result_d):
- self.reads = reads
- self.chromosomes = chromos
- self.result_dir = result_d
-
- self.max_mismatch = 1
- self.max_equal_hits = 100
- self.max_gap_size = 6000
- self.seed_size = 5
-
-
- def performRemapping(self):
- for currentChr,chromId in self.chromosomes:
- result_filename ='soap_r2_v%d_w%d_g%d_s%d_%s.result' %\
- (self.max_mismatch,self.max_equal_hits,self.max_gap_size,self.seed_size,chromId)
-
- result = jp(self.result_dir,result_filename)
-
- cmd = '%s -a %s -d %s -o %s' % (self.soap_binary,self.reads,currentChr,result)
- cmd +=' -r 2 -v %d -w %d -g %d -s %d' %\
- (self.max_mismatch,self.max_equal_hits,self.max_gap_size,self.seed_size)
-
- #os.system('echo \"%s\" | qsub -l h_vmem=5.0G -cwd -j y -N \"%s.log\"'%(cmd,chromId))
- obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
- out,err = obj.communicate()
- print out
- print err
-
-
-if __name__ == '__main__':
- reads='/fml/ag-raetsch/share/projects/qpalma/solexa/allFilteredReads.fa'
- # we expect a list of tuples where each tuple consists of a chromosome dna flat
- # file location and a descriptor to name the result file for this chromosome
- dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
- chromo = [jp(dir,elem) for elem in [('chr%d.dna.fa'%idx) for idx in range(1,6)]]
- ids = ['chr%d'%idx for idx in range(1,6)]
- chromosomes = zip(chromo,ids)
- #chromosomes=[('/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/chr1.dna.fa','chr1')]
- result_dir = '/fml/ag-raetsch/share/projects/qpalma/solexa/soap_results/'
-
- remap1 = Remapping(reads,chromosomes,result_dir)
- remap1.performRemapping()
+++ /dev/null
-#!/bin/bash
-
-run_dir=/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+
-data_dir=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data
-
-python PipelineHeuristic.py $data_dir/map.vm $run_dir/param_526.pickle $run_dir/run_obj.pickle $data_dir/spliced.heuristic $data_dir/unspliced.heuristic