BLASTer script update

Updated the previous dirty script to add missing features. Now it also makes the reciprocal BLAST for fragments similar to candidate genes.

  1. BLAST each candidate gene against your local database of interest (eg, transcriptome of an organism).
  2. Best 3 loci hits for the gene are BLASTed against the reciprocal candidate genes database (eg, human protein).
  3. Each locus is printed showing the best 5 genes (by gene id) and their e-values from the reciprocal BLAST and a FASTA file with the loci sequences for further analysis.

Installation

  1. Install dependencies.
    • Python 2.7 (not Python 3)
    • Biopython 1.56
    • BLAST+ 2.2.25
  2. Copy the blaster.py to a new folder where it will run.
  3. Make sure you know where the local and reciprocal databases are (or copy them to the folder).

Usage

  1. Convert databases to be BLASTable (if needed).
    makeblastdb -in membranipora.fa -parse_seqids -dbtype nucl
    makeblastdb -in human_protein.fa -parse_seqids -dbtype prot
    
  2. Copy FASTA files with 1 gene sequence per file into the candidates folder.
  3. Run the BLASTer command (database could also be in another folder).
    python blaster.py -d membranipora.fa -b tblastn
    
  4. Results (loci) are printed to 2 files. results.txt shows each locus with reciprocal BLAST output and e-values. results.fa aggregates the loci and their sequences.

Arguments

-c, --candidates — Folder with “.fa” or “.gb” files of candidate genes.
One gene per file.
-d, --database — Local database with new data (eg, transcriptome).
-b, -blast — BLAST command (blastn, blastp, blastx, tblastn, tblastx).

Code

Copy the code below and save it as a new file blaster.py OR download BLASTer.

#!/usr/bin/python
<ol>
	<li>-*- coding: utf-8 -*-</li>
</ol>
'''BLASTing batch script.

Dependencies:
<ul>
<li>Python 2.7.1</li>
<li>Biopython 1.56</li>
<li>BLAST+ 2.2.25</li>
</ul>



Configuration (Linux):
<ul>
<li>Regular packages for Python and Biopython.</li>
<li>Add BLAST+ commands to path putting this code to .bashrc:</li>
</ul>



PATH=$PATH:/home/nelas/Downloads/ncbi-blast-2.2.25+/bin
export PATH

See README file for details or execute the command for usage and arguments:

python blaster.py -h

'''

import getopt
import logging
import os
import pickle
import re
import subprocess
import sys

from Bio import Entrez, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Blast.Applications import NcbiblastnCommandline, NcbiblastpCommandline, NcbiblastxCommandline, NcbitblastnCommandline, NcbitblastxCommandline

class Sequence(object):
'''Represents a known sequence.

filepath: path to fasta file.
ref: sequence reference number.
description: long description.
sequence: the sequence itself.
gene_id: NCBI gene identification.
gene_name: NCBI gene name.
blast_output: path to .xml file.
loci: list of loci with recurrency?.
'''
Entrez.email = 'your@email.com'
EVALUE_THRESH = 0.001

<ol>
<li>Cache folder.</li>
</ol>


cache_folder = 'seqs'
<ol>
<li>Check if cache folder exists.</li>
</ol>


if not os.path.isdir(cache_folder):
os.mkdir(cache_folder)

#TODO add __str__ to classes.

def __init__(self, filepath=None, ref=None, database=None):
#TODO Make limit a variable variable.
self.limit = 3
self.loci = {}
self.gene_name = ''
self.database = database
if filepath:
self.filepath = filepath
self.parse_fasta(self.filepath)
self.set_gene_id()
self.set_gene_name()
else:
cache_path = os.path.join(self.cache_folder, ref)
try:
cache_file = open(cache_path, 'rb')
cached = pickle.load(cache_file)
self.load_cache(cached)
cache_file.close()
except:
self.fetch(ref)
cache_file = open(cache_path, 'wb')
pickle.dump(self, cache_file)
cache_file.close()

def load_cache(self, cached):
'''Load attributes saved in cache files.'''
self.description = cached.description
self.ref = cached.ref
self.sequence = cached.sequence
self.gene_name = cached.gene_name
self.gene_id = cached.gene_id

def parse_fasta(self, filepath):
'''Parse data from FASTA file.'''
record = SeqIO.read(filepath, 'fasta')
<ol>
<li>Define attributes.</li>
</ol>


self.description = record.description.split('|')[-1]
#XXX Better parse ref!
self.ref = record.id.split('|')[3]
self.sequence = str(record.seq)
self.gene_name = filepath.split('/')[-1][:-3]

def set_gene_id(self):
'''Get and set gene id from NCBI.'''
#import pdb; pdb.set_trace()
#FIXME Handle error when not returning id.
handle = Entrez.esearch(db='gene', term=self.ref)
record = Entrez.read(handle)
if record['IdList']:
self.gene_id = record['IdList'][0]
else:
self.gene_id = None

def set_gene_name(self):
'''Get and set gene name from filepath.'''
filepath = os.path.basename(self.filepath)
self.gene_name = os.path.splitext(filepath)[0]

def fetch(self, ref):
'''Fetch data from NCBI.'''
handle = Entrez.efetch(db='protein', id=ref, rettype='gb')
record = SeqIO.read(handle, 'genbank')
#TODO maybe just save file locally...
handle = Entrez.efetch(db='protein', id=ref, rettype='gb')
handle_string = handle.read()
<ol>
<li>Define attributes.</li>
</ol>


#FIXME See if this is ok!
self.description = record.description
self.ref = record.id
self.sequence = str(record.seq)
self.set_gene_id()
<ol>
<li>Find gene name by regular expression.</li>
</ol>


try:
pattern = re.compile(r'gene="(.+)"')
reobj = pattern.search(handle_string)
self.gene_name = reobj.group(1)
except:
self.gene_name = 'Not found'
<ol>
<li>To extract gene description, name and other info from xml parsing.</li>
</ol>


#&gt;&gt;&gt; record['Bioseq-set_seq-set'][0]['Seq-entry_set']['Bioseq-set']['Bioseq-set_seq-set'][0]['Seq-entry_seq']['Bioseq']['Bioseq_annot'][0]['Seq-annot_data']['Seq-annot_data_ftable'][1]['Seq-feat_data']['SeqFeatData']['SeqFeatData_gene']['Gene-ref']
#&gt;&gt;&gt; {u'Gene-ref_desc': 'DEAD (Asp-Glu-Ala-Asp) box polypeptide 4', u'Gene-ref_syn': ['VASA'], u'Gene-ref_locus': 'DDX4'}

def parse_blast(self):
'''Parse BLAST output file.'''
blast_records = NCBIXML.parse(open(self.blast_output))
n = 0
<ol>
<li>Iterate over BLAST results from database.</li>
</ol>


for blast_record in blast_records:
for alignment in blast_record.alignments:
for hsp in alignment.hsps:
if n &lt; self.limit:
if hsp.expect &lt; self.EVALUE_THRESH:

print 'Instantiating &gt;&gt; %s' % alignment.title
locus_id = alignment.title.split()[0]

<ol>
<li>Save locus to dictionary if key does not exists.</li>
</ol>


#NOTE This means that only the first hsp found in the locus is added,
<ol>
<li>while the other ones are skipped</li>
</ol>


if not locus_id in self.loci.keys():
self.loci[locus_id] = {
'score': hsp.score,
'evalue': hsp.expect
}

n += 1

class Locus(object):
'''Represents a new sequence.

candidate: instance of Candidate.
description: long description.
sequence: the sequence itself.
score: from Candidate BLAST.
evalue: from Candidate BLAST.
reverse_blast_output: reverse BLAST to human protein db.
'''
EVALUE_THRESH = 0.001

<ol>
<li>Reverse BLAST folder.</li>
</ol>


reverse_folder = 'reverse'

def __init__(self, id, candidate, database):
<ol>
<li>Linkback to candidates.</li>
</ol>


self.candidates = {}

self.reciprocals = []

self.update_candidates(candidate)

self.reciprocal = False
self.rank = 0
self.id = id
self.database = database

<ol>
<li>Create filename before the rest.</li>
</ol>


self.set_filename()

self.reverse_results_folder = os.path.join(self.reverse_folder, 'results')

<ol>
<li>Check if reverse folder exists.</li>
</ol>


if not os.path.isdir(self.reverse_folder):
os.mkdir(self.reverse_folder)
<ol>
<li>Check if reverse results folder exists.</li>
</ol>


if not os.path.isdir(self.reverse_results_folder):
os.mkdir(self.reverse_results_folder)

self.set_filepath()
self.set_blast_output()
self.set_sequence()

self.write_fasta()

def update_candidates(self, candidate):
'''Update list of linked candidate genes.'''
if not candidate.gene_name in self.candidates.keys():
self.candidates[candidate.gene_name] = candidate

def set_sequence(self):
'''Get and set sequence from database.'''
parsed = SeqIO.parse(self.database, 'fasta')
for locus in parsed:
if locus.id == self.id:
self.sequence = str(locus.seq)
if not self.sequence:
print 'SEQUENCE NOT SET! Check the IDs.'

def set_blast_output(self):
'''Build filepath for reverse blast output.'''
blast_output = os.path.join(self.reverse_results_folder, '%s.xml' % self.filename)
self.reverse_blast_output = blast_output

def set_filepath(self):
'''Build filepath from folder and filename.'''
locus_filepath = os.path.join(self.reverse_folder, '%s.fa' % self.filename)
self.filepath = locus_filepath

def set_filename(self):
'''Extract filename from ID.'''
filename = '_'.join(self.id.split('_')[:2])
self.filename = filename

def write_fasta(self):
'''Write FASTA file to filepath.'''
#TODO Insert line breaks for sequences.
locus_file = open(self.filepath, 'w')
locus_file.write('&gt;%s\n%s' % (self.id, self.sequence))
locus_file.close()

def parse_blast(self):
'''Parse reciprocal BLAST.'''
print 'Parsing reciprocal BLAST: %s' % self.filename
blast_records = NCBIXML.parse(open(self.reverse_blast_output))
for blast_record in blast_records:
for alignment in blast_record.alignments:
for hsp in alignment.hsps:
if hsp.expect &lt; self.EVALUE_THRESH:
#XXX Better parse ref!
ref = alignment.title.split('|')[3]
#print 'Creating object: %s' % ref
sequence = Sequence(ref=ref)
sequence.evalue = hsp.expect
sequence.score = hsp.score
self.reciprocals.append(sequence)
<ol>
<li>Process results.</li>
</ol>


#self.process()

def process(self):
'''Define if locus is reciprocal to the gene.

True: if it matches the same gene_id and ref from the candidate gene.
'''
#TODO Handle when gene_id is None?
rank = 0
print '\nrank\tgene\tref\t\thit'
for sequence in self.reciprocals:
if sequence.gene_id == current_gene_id:
#DOSTUFF
continue
else:
break
current_gene_id = sequence.gene_id

rank += 1
#if sequence.gene_id == self.candidate.gene_id:
<ol>
<li>if sequence.ref == self.candidate.ref:</li>
<li>print '%d\t%s\t%s\t%s' % (rank, sequence.gene_id, sequence.ref, '+')</li>
<li>print '\nFound reciprocal! Gene: %s, Ref: %s\n' % (sequence.gene_id, sequence.ref)</li>
<li>self.reciprocal = True</li>
<li>self.rank = rank</li>
<li>break</li>
</ol>


print '%d\t%s\t%s\t%s' % (rank, sequence.gene_id, sequence.ref, '-')

def blast(blast_type, arguments):
'''Execute BLAST command.'''
<ol>
<li>Instantiate the BLAST command.</li>
</ol>


if blast_type == 'blastn':
cline = NcbiblastnCommandline(**arguments)
elif blast_type == 'blastp':
cline = NcbiblastpCommandline(**arguments)
elif blast_type == 'blastx':
cline = NcbiblastxCommandline(**arguments)
elif blast_type == 'tblastn':
cline = NcbitblastnCommandline(**arguments)
elif blast_type == 'tblastx':
cline = NcbitblastnCommandline(**arguments)

<ol>
<li>Execute BLAST.</li>
</ol>


stdout, stderr = cline()
print '%s BLASTed!' % arguments['query']

def prepare(candidates, candidates_folder):
'''Check candidate gene files (convert to FASTA, if needed).'''
for gene in candidates:
gene_filepath = os.path.join(candidates_folder, gene)

if gene.endswith('.gb'):
<ol>
<li>Create record object.</li>
</ol>


record = SeqIO.read(gene_filepath, 'genbank')
<ol>
<li>Create FASTA file.</li>
</ol>


try:
fasta_file = open(gene_filepath.split('.')[0] + '.fa', 'r')
except IOError:
fasta_file = open(gene_filepath.split('.')[0] + '.fa', 'w')
fasta_file.write(record.format('fasta'))
fasta_file.close()
elif gene.endswith('.fa'):
continue
else:
logger.debug('File type not supported: %s', gene)

def usage():
'''Explanation for arguments.'''

print
print 'USAGE: ./blaster.py -d Membranipora.fasta -b tblastn'
print
print ' -c, --candidates \n\tFolder with `.fa` or `.gb` files of candidate genes. One gene per file.'
print ' -d, --database \n\tLocal database with new data (eg, transcriptome).'
print ' -b, --blast \n\tBLAST command (blastn, blastp, blastx, tblastn, tblastx).'
print

#TODO Omit the candidates folder option? Should not be needed.
#TODO Ability to specify limit for candidate blast results to be analysed.
#TODO Choose evalue threshold.

def main(argv):

<ol>
<li>Available BLASTs.</li>
</ol>


blast_types = ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']
<ol>
<li>Database name.</li>
</ol>


database = None
<ol>
<li>BLAST command.</li>
</ol>


blast_type = None

<ol>
<li>Folder with candidate-genes.</li>
</ol>


candidates_folder = 'candidates'
<ol>
<li>Folder with candidate-genes BLASTs.</li>
</ol>


candidates_results_folder = os.path.join(candidates_folder, 'results')

<ol>
<li>Parse arguments.</li>
</ol>


try:
opts, args = getopt.getopt(argv, 'hc:d:b:', [
'help',
'candidates=',
'database=',
'blast=',
])
except getopt.GetoptError:
usage()
logger.critical('Argument error: "%s". Aborting...',
' '.join(argv))
sys.exit(2)

<ol>
<li>Argument handler.</li>
</ol>


for opt, arg in opts:
if opt in ('-h', '--help'):
usage()
sys.exit()
elif opt in ('-c', '--candidates'):
candidates_folder = arg
elif opt in ('-d', '--database'):
database = os.path.abspath(arg)
elif opt in ('-b', '--blast'):
blast_type = arg

<ol>
<li>Print summary of arguments.</li>
</ol>


logger.debug('Arguments: candidates=%s, database=%s, blast=%s', candidates_folder, database, blast_type)

<ol>
<li>Check if BLAST command was specified.</li>
</ol>


if not blast_type:
logger.critical('BLAST command was not specified (use "-b"). Aborting...')
sys.exit(2)
else:
if not blast_type in blast_types:
logger.critical('Unknown BLAST command: %s', blast_type)
logger.info('Available BLAST commands: %s', ', '.join(blast_types))
sys.exit(2)

<ol>
<li>Get candidate genes.</li>
</ol>


try:
candidates = os.listdir(candidates_folder)
except OSError:
logger.critical('Folder "%s" does not exist! Aborting...', candidates_folder)
sys.exit(2)

<ol>
<li>Issue error if there are no candidate genes.</li>
</ol>


if not candidates:
logger.critical('There are no candidate genes at %s folder! Aborting...',
candidates_folder)
sys.exit(2)

<ol>
<li>Check if results exists.</li>
</ol>


if not os.path.isdir(candidates_results_folder):
os.mkdir(candidates_results_folder)

<ol>
<li>Process candidate genes.</li>
</ol>


prepare(candidates, candidates_folder)
<ol>
<li>Get proper genes, now.</li>
</ol>


candidates = os.listdir(candidates_folder)
<ol>
<li>Only read FASTA files.</li>
</ol>


candidates = [file for file in candidates if file.endswith('.fa')]
#TODO make it recognize more FASTA extensions.

<ol>
<li>Print info before starting.</li>
</ol>


logger.info('%d genes to be BLASTed against %s database!', len(candidates),
database)

<ol>
<li>Main object to store genes and loci.</li>
</ol>


genes = {}
loci = {}

<ol>
<li>BLAST each gene against local database.</li>
</ol>


for genefile in candidates:
print
logger.info('BLASTing %s against %s...', genefile, database)

<ol>
<li>Instantiate Sequence.</li>
</ol>


gene_filepath = os.path.join(candidates_folder, genefile)
candidate = Sequence(filepath=gene_filepath, database=database)

<ol>
<li>Define variables.</li>
</ol>


output_filepath = os.path.join(candidates_results_folder, '%s.xml' % candidate.gene_name)
candidate.blast_output = output_filepath

<ol>
<li>Only BLAST if needed.</li>
</ol>


try:
blastfile = open(candidate.blast_output)
blastfile.close()
except:
arguments = {
'query': candidate.filepath,
'db': database,
'out': candidate.blast_output,
'outfmt': 5, # Export in XML
}
<ol>
<li>Execute BLAST command.</li>
</ol>


blast(blast_type, arguments)

logger.info('Parsing XML...')
print '\nCandidate &gt;&gt; Gene: %s, ID:%s, Ref: %s, Description: %s' % (
candidate.gene_name, candidate.gene_id,
candidate.ref, candidate.description
)
candidate.parse_blast()

#DO LOCUS STUFF
for locus_id in candidate.loci.keys():
<ol>
<li>If locus was already specified.</li>
</ol>


if locus_id in loci.keys():
locus = loci[locus_id]
locus.update_candidates(candidate)
else:
<ol>
<li>Instantiate Locus object.</li>
</ol>


locus = Locus(locus_id, candidate, database)

<ol>
<li>Reciprocal BLAST querying locus sequence against human database.</li>
</ol>


try:
blastfile = open(locus.reverse_blast_output)
blastfile.close()
except:
#XXX Find a better way to overcome problem when path has a "|".
reverse_args = {
'query': locus.filepath.replace('|', '\|'),
'db': '/home/nelas/Biologia/Doutorado/genomic_databases/human_protein.fa',
'out': locus.reverse_blast_output.replace('|', '\|'),
'outfmt': 5,
}
if blast_type == 'tblastn':
blast('blastx', reverse_args)
elif blast_type == 'blastp':
blast('blastp', reverse_args)

<ol>
<li>Parse reverse BLAST output.</li>
</ol>


locus.parse_blast()

<ol>
<li>Add locus to main list.</li>
</ol>


loci[locus_id] = locus

<ol>
<li>Add to main dictionary.</li>
</ol>


genes[candidate.gene_name] = candidate

<ol>
<li>Print loci equivalent to candidate genes.</li>
</ol>


logger.info('Creating result files...')

fasta_loci = []
output_file = open('results.txt', 'w')

#TODO Print date and variables used to results.txt.
for locus_id, locus in loci.iteritems():
fasta_loci.append(SeqRecord(Seq(locus.sequence), id=locus_id,
description='related to: %s' % ', '.join(locus.candidates.keys())))
print
print locus_id, locus.candidates.keys()
output_file.write('%s\t\t(candidates: %s)\n\n' % (locus_id,
', '.join(locus.candidates.keys())))
try:
first = locus.reciprocals[0]
except:
print '\tNo reciprocals.'
current_gene_id = first.gene_id
n_genes = 0
output_file.write('\tgene\t\tid\t\taccession\t\te-value\n')
for sequence in locus.reciprocals:
<ol>
<li>Limits to 5 most similar genes.</li>
</ol>


if sequence.gene_id == current_gene_id or n_genes &lt; 5:
print '\t%s, %s, %s, %s' % (
sequence.gene_name, sequence.gene_id,
sequence.ref, sequence.evalue
)

output_file.write('\t%s\t\t%s\t\t%s\t\t%.2e' % (
sequence.gene_name, sequence.gene_id,
sequence.ref, sequence.evalue
))
accession_numbers = [seq.ref for seq in locus.candidates.values()]
if sequence.ref in accession_numbers:
output_file.write(' &lt;&lt;')
output_file.write('\n')
else:
break
current_gene_id = sequence.gene_id
n_genes += 1
output_file.write('\n\n')
print
output_file.close()

wrote_fasta = SeqIO.write(fasta_loci, 'results.fa', 'fasta')

logger.info('%d records written to results.fa', wrote_fasta)
logger.info('Done, bye!')

if __name__ == '__main__':
<ol>
<li>Create logger.</li>
</ol>


logger = logging.getLogger('blaster')
logger.setLevel(logging.DEBUG)
logger.propagate = False
<ol>
<li>Define message format.</li>
</ol>


formatter = logging.Formatter('[%(levelname)s] %(asctime)s @ %(module)s %(funcName)s (l%(lineno)d): %(message)s')

<ol>
<li>Create logger console handler.</li>
</ol>


console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
<ol>
<li>Format console output.</li>
</ol>


console_handler.setFormatter(formatter)
<ol>
<li>Add console to logger.</li>
</ol>


logger.addHandler(console_handler)

<ol>
<li>Create logger file handler.</li>
</ol>


file_handler = logging.FileHandler('log_blaster.log')
file_handler.setLevel(logging.DEBUG)
<ol>
<li>Format file output.</li>
</ol>


file_handler.setFormatter(formatter)
<ol>
<li>Add file to logger.</li>
</ol>


logger.addHandler(file_handler)

<ol>
<li>Initiates main function.</li>
</ol>


logger.info('BLASTer is running...')
main(sys.argv[1:])