'''
This file takes as input
-f fasta file
-o outputfile
-p prediction file
-l label
-s prob_cutoff
-m min length
and it extracts dasta seq by label after reading prediction file
'''
from __future__ import division
import sys
import getopt
import math
import re
from Bio import SeqIO
from Bio.Seq import Seq
#set defaults
cutoff=0
min_len=0
max_len=float('Inf')
label='0'
try:
opts, args = getopt.getopt(sys.argv[1:],"hf:o:p:l:s:r:",["ifile=","ofile=","min=","max="])
#print opts
except getopt.GetoptError:
print 'predstoseq.py -f -o -p -l -s -m '
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print 'Usage:'
print 'predstoseq.py -f -o -p -l -s -m '
sys.exit()
elif opt in ("-f", "--ifile"):
#print 'infile found'
inputfile = arg
elif opt in ("-o", "--ofile"):
outputfile = arg
#print outputfile
elif opt in ("-p"):
predsfile= arg
#print predsfile
elif opt in ("-l"):
label=arg
#print label
elif opt in ("-s"):
#print 's: '+arg
cutoff=float(arg)
#print cutoff
elif opt in ("-r"):
min_len=int(arg)
elif opt in ("--min"):
min_len=int(arg)
#print 'max found'
elif opt in ("--max"):
max_len=int(arg)
#print 'max found'
if cutoff>1 or cutoff<0:
print 'please enter probability value in range [0,1]'
sys.exit()
if max_len<=min_len:
print 'Error: check min and max len'
sys.exit()
if label != '0' and label != '1':
print 'Please check lablel is 0 or 1'
sys.exit()
print '**********Extracting Sequences***************'
print 'class prob cutoff='+ str(cutoff)
print 'min length cutoff='+ str(min_len)
print 'max length cutoff='+ str(max_len)
#label=sys.argv[4]
#open preds file
idlist=[]
with open(predsfile) as f:
content=f.readlines()
for l in content:
if label=='1':
tocheck=float(l.split('\t')[2])
elif label=='0':
tocheck=float(l.split('\t')[3])
else:
print 'check label\nError'
sys.exit(0)
if l.split('\t')[1]==label:
if cutoff <= tocheck :
idlist.append(l.split('\t')[0])
print 'Total sequences in prediction file with label '+label+' and class prob >= '+str(cutoff)+', were: '+str(len(idlist))
ctr=0
min_len_filter=0
max_len_filter=0
#extract seq from fasta
output_handle = open(outputfile, "w")
for record in SeqIO.parse(inputfile, "fasta"):
found=0
seqid=record.id
#print record.seq
if seqid in idlist :
#write to file
#print 'found'
if len(record.seq)>max_len:
max_len_filter=max_len_filter+1
elif len(record.seq) '+str(max_len)+', were: '+str(max_len_filter)
print 'Sequences not found in the fasta file were: '+str(len(idlist)-ctr-min_len_filter-max_len_filter)
if len(idlist)-ctr-min_len_filter-max_len_filter>0:
print 'WARNING:'
print 'Please check input fasta as '+str(len(idlist)-ctr-min_len_filter-max_len_filter)+ ' sequences in the prediction file did not match to any sequences in fasta file'
print 'Total sequences written: '+str(ctr)
print 'File '+outputfile+' saved!'