2 # -*- coding: utf-8 -*-
12 You have to supply two files. One containing the gff information and the other
13 one containing the information of the Solexa(R) reads.
15 Usage: ./createTestSet.py gff.pickle reads.txt
20 Make sure that you split the data into chromosome files beforehand because
21 this method does not check the chromosome info
25 print "".join([elem
+' ' for elem
in row
]).strip()
27 def check(annotation
,reads
):
28 skippedReadsCounter
= 0
30 reader
= csv
.reader(reads
, delimiter
='\t', quoting
=csv
.QUOTE_NONE
)
31 for pos
,row
in enumerate(reader
):
33 skippedReadsCounter
+= 1
37 print >> sys
.stderr
, pos
44 for gene
in annotation
:
45 # read inside current gene positions
47 if gene
.start
<= readStart
and pos
+35 <= gene
.stop
:
50 for exonStart
,exonStop
in gene
.exons
:
51 if exonStart
<= readStart
and readStop
<= exonStop
:
59 # read is overlapping with gene boundaries
60 if pos
< gene
.start
and pos
+35 > gene
.start
:
64 if pos
< gene
.stop
and pos
+35 > gene
.stop
:
68 print skippedReadsCounter
70 if __name__
== '__main__':
71 assert len(sys
.argv
) >= 3, info
72 annotFile
= sys
.argv
[1]
73 readsFile
= sys
.argv
[2]
74 assert os
.path
.exists(annotFile
), 'File %s does not exist!' % annotFile
75 assert os
.path
.exists(readsFile
), 'File %s does not exist!' % readsFile
76 print >> sys
.stderr
, doc
78 annotation
= cPickle
.load(open(annotFile
))
79 reads
= open(readsFile
)
81 check(annotation
,reads
)