#!/usr/bin/perl

##########################################################################################
#   This file is part of Proteinortho.
#   (C) 2009/2010 Marcus Lechner
#
#   Proteinortho is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published
#   by the Free Software Foundation; either version 2, or (at your
#   option) any later version.
#
#   Proteinortho is distributed in the hope that it will be useful, but
#   WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#   General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with Proteinortho; see the file COPYING.  If not, write to the
#   Free Software Foundation, Inc., 59 Temple Place - Suite 330,
#   Boston, MA 02111-1307, USA.
##########################################################################################
##########################################################################################
# About
##########################################################################################
# Proteinortho
# input fasta files with proteins
# output matrix with orthologous proteins
#
# @authors Marcus Lechner, Clemens Elias Thoelken, Paul Klemm
# @email lechner\@staff.uni-marburg.de
# @company University of Maruburg
# @date 2017-04-18
#
##########################################################################################

##MARK_FOR_NEW_BLAST_ALGORITHM => FLAG for adding a new blast algorithm.

### ============================================================================
### MAN-PAGE
###

=head1 NAME

proteinortho - Orthology detection tool (lechner\@staff.uni-marburg.de)

=head1 SYNOPSIS

proteinortho6.pl [options] <fasta files (one for each species, at least 2)>

ALIAS

proteinortho [options] <fasta files>

=head1 DESCRIPTION

B<proteinortho> is a tool to detect orthologous genes within different species.  For doing so, it compares similarities of given gene sequences and clusters them to find significant groups. The algorithm was designed to handle large-scale data and can be applied to hundreds of species at one. Details can be found in Lechner et al., BMC Bioinformatics. 2011 Apr 28;12:124.
To enhance the prediction accuracy, the relative order of genes (synteny) can be used as additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF (manuscript in preparation), is already build in Proteinortho.

Proteinortho assumes, that you have all your gene sequences in FASTA format either
represented as amino acids or as nucleotides. The source code archive contains some examples, namely C.faa, E.faa, L.faa, M.faa located in the test/ directory.
I<By default Proteinortho assumes amino> I<acids and thus uses diamond> (-p=diamond) to compare sequences. If you have nucleotide sequences, you need to change this by adding the parameter
-p=blastn+ (or some other algorithm). (In case you have only have NCBI BLAST legacy installed, you need to tell this too - either by adding -p=blastp_legacy or -p=blastn_legacy respectively.)
The full command for the example files would thus be proteinortho6.pl -project=test test/C.faa test/E.faa test/L.faa test/M.faa.
Instead of naming the FASTA files one by one, you could also supply test/*.faa as argument.
Please note that the parameter -project=test is optional. With this, you can set the prefix of the output files generated by Proteinortho.
If you skip the project parameter, the default project name will be myproject.

=head1 OPTIONS

=head2 Main parameters

use the GUI proteinorthoHelper.html (downloadable at https://gitlab.com/paulklemm_PHD/proteinortho/raw/master/proteinorthoHelper.html?inline=false)

=over 4

=item B<--step>={0,1,2,3} (default: 0)

0 -> all. 1 -> prepare blast (build db). 2 -> run all-versus-all blast. 3 -> run the clustering.

=item B<--project>=name (default: myproject)

prefix for all resulting file names

=item B<--cpus>=number (default: all available)

the number of processors to use (multicore/processor support) for step 2 and step 3

=item B<--ram>=number (default: 75% of total memory)

maximal used ram threshold for LAPACK and the input graph in MB. By default 75% of the total memory is used

=item B<--verbose>={0,1,2} (default: 1)

verbose level. 1:keeps you informed about the progress

=item B<--silent>

sets the verbose level to 0.

=item B<--temp>=directory(.)

path to the temporary files

=item B<--force>

forces the recalculation of the blast results in any case in step=2. Also forces the recreation of the database generation in step=1

=item B<--clean>

removes all database files generated by the -p= algorithm afterwards

=back

=head2 Search options (step 1-2)

(output: <myproject>.blast-graph)

=over 4

=item B<--p>=algorithm (default: diamond)

B<blastn>,B<blastp>,B<tblastx>,B<blastn+>,B<blastp+>,B<tblastx+> : standard blast family. The suffix 'n' or 'p' indicates nucleotide or protein version (of the input files). Use *_legacy for legacy blast.

B<diamond> : Only for protein files! standard diamond procedure and for genes/proteins of length >40 with the additional --sensitive flag

B<lastn>,B<lastp> : lastal. -n : dna files, -p protein files (BLOSUM62 scoring matrix)!

B<rapsearch> : Only for protein files!

B<mmseqsp>,B<mmseqsn> : mmseqs2.

B<topaz> : Only for protein files!

B<usearch> : usearch_local procedure with -id 0 (minimum identity percentage).

B<ublast> : usearch_ublast procedure.

B<blatp>,B<blatn> : blat. -n : dna files, -p protein files

=item B<--e>=evalue (default: 1e-05)

E-value for blast

=item B<--selfblast>

apply selfblast, detects paralogs without orthologs

=item B<--sim>=float (default: 0.95)

min. similarity for additional hits

=item B<--identity>=number (default: 25)

min. percent identity of best blast hits

=item B<--cov>=number (default: 50)

min. coverage of best blast alignments in %

=item B<--subparaBlast>='options'

additional parameters for the search tool (-p=blast,diamond,...) example -subpara='-seg no' or -subpara='--more-sensitive' for diamond

=back

=head2 Synteny options (optional, step 2)

(output: <myproject>.ffadj-graph, <myproject>.poff-graph.tsv)

=over 4

=item B<--synteny>

activate PoFF extension to separate similar by contextual adjacencies (requires .gff for each .fasta)

=item B<--dups>=number (default: 0)

PoFF: number of reiterations for adjacencies heuristic, to determine duplicated regions

=item B<--cs>=number (default: 3)

PoFF: Size of a maximum common substring (MCS) for adjacency matches

=item B<--alpha>=number (default: .5)

PoFF: weight of adjacencies vs. sequence similarity

=back

=head2 Clustering options (step 3)

(output: <myproject>.proteinortho.tsv, <myproject>.proteinortho.html, <myproject>.proteinortho-graph)

=over 4

=item B<--singles>

report singleton genes without any hit

=item B<--purity>=float (default: 1e-7)

avoid spurious graph assignments

=item B<--conn>=float (default: 0.1)

min. algebraic connectivity

=item B<--minspecies>=float (default: 1, must be >=0)

min. number of genes per species. If a group is found with up to (minspecies) genes/species, it wont be split again (regardless of the connectivity).

=item B<--nograph>

do not generate .graph file (pairwise orthology relations)

=item B<--subparaCluster>='options'

additional parameters for the clustering algorithm (proteinortho_clustering) example -subparaCluster='-maxnodes 10000'.
Note: -rmgraph cannot be set. All other parameters of subparaCluster are replacing the default values (like -cpus or -minSpecies)

=item B<--xml>

do generate an orthoXML file (see http://www.orthoxml.org for more information). You can also use proteinortho2xml.pl <myproject.proteinortho>.

=item B<--exactstep3>

perform the clustering without the k-mere heuristic. The k-mere heuristic is only applied for very large connected components (>1e+6 nodes) and if the algorithm would start to iteratate very slowly

=item B<--mcl>

enables the mcl algorithm for clustering instead of power iteration or lapacks routine. (needs mcl to be installed, call 'mcl' to test if you installed mcl correctly)

=back

=head2 Misc options

=over 4

=item B<--checkfasta>

checks input fasta files if the given algorithm can process the given fasta file.

=item B<--cleanblast>

cleans blast-graph with proteinortho_cleanupblastgraph (either generated in step 2 or loaded in step 3)

=item B<--desc>

write description files (for NCBI FASTA input only)

=item B<--binpath>=directory (default: PATH)

path to your local executables (blast, diamond, mcl, ...)

=item B<--debug>

gives detailed information for bug tracking

=back

=head2 Large compute jobs

=over 4

=item B<--jobs>=M/N

If you want to involve multiple machines or separate a Proteinortho run into smaller chunks, use the -jobs=B<M>/B<N> option.
First, run 'proteinortho6.pl -steps=1 ...' to generate the indices.
Then you can run 'proteinortho6.pl -steps=2 -jobs=B<M>/B<N> ...' to run small chunks separately.
Instead of B<M> and B<N> numbers must be set representing the number of jobs you want to divide the run into (B<M>) and the job division to be performed by the process.
E.g. to divide a Proteinortho run into 4 jobs to run on several machines, use

=back

=head1 Output

=head2 BLAST Search (step 1-2)

=over 4

=item <myproject>.B<blast-graph>

filtered raw blast data based on adaptive reciprocal best blast
matches (= reciprocal best match plus all reciprocal matches within a
range of 95% by default) The first two rows are just comments
explaining the meaning of each row. Whenever a further comment line (starting
with #) follows, it indicates results comparing the two species is
about to follow. E.g. # M.faa L.faa tells that the next lines representing
results for species M and L. All matches are reciprocal matches. If
e.g. a match for M_15 L_15 is shown, L_15 M_15 exists implicitly.
E-Values and bit scores for both directions are given behind each
match. The 4 comment numbers ('# 3.8e-124        434.9...') are representing the median values of
evalue_ab, bitscore_ab, evalue_ba and bitscore_ba.

  ----------------------------------------
  # file_a  file_b
  # a b evalue_ab bitscore_ab evalue_ba bitscore_ba
  # E.faa C.faa
  # 3.8e-124        434.9   2.8e-126        442.2
  E_10  C_10  3.8e-124  434.9 2.8e-126  442.2
  E_11  C_11  5.9e-51 190.7 5.6e-50 187.6
  ----------------------------------------

=back

=head2 Clustering (step 3)

=over 4

=item <myproject>.B<proteinortho-graph>

clustered myproject.blast-graph. Its connected components are represented in myproject.proteinortho.tsv. The format is the same as the blast-graph (see above).

  ----------------------------------------
  # file_a  file_b
  # a b evalue_ab bitscore_ab evalue_ba bitscore_ba
  # E.faa C.faa
  E_10  C_10  3.8e-124  434.9 2.8e-126  442.2
  E_11  C_11  5.9e-51 190.7 5.6e-50 187.6
  ----------------------------------------

=item <myproject>.B<proteinortho>

The connected components.
The first line starting with #is a comment line indicating the meaning of each column for each of the following lines which represent an orthologous group each.
The very first column indicates the number of species covered by this group. The second column indicates the number of genes included in the group.
Often, this number will equal the number of species, meaning that there is a single ortholog in each species.
If the number of genes is bigger than the number of species, there are co-orthologs present.
The third column gives rise to the algebraic connectivity of the respective group. Basically, this indicates how densely the genes are connected in the orthology graph that was used for clustering.
A connectivity of 1 indicates a perfect dense cluster with each gene similar to each other gene.
By default, Proteinortho splits each group into two more dense subgroups when the connectivity is below 0.1

  ----------------------------------------
  # Species Genes Alg.-Conn.  C.faa C2.faa  E.faa L.faa M.faa
  2 5 0.16  * * * L_643,L_641 M_649,M_640,M_642
  3 6 0.138 C_164,C_166,C_167,C_2 * * L_2 M_2
  2 4 0.489 * * * L_645,L_647 M_644,M_646
  ----------------------------------------

=back

=head2 POFF (-synteny)

The synteny based graph files (myproject.ffadj-graph and myproject.poff-graph) have two additional columns:
same_strand and simscore. The first one indicates if two genes from a match are located at the same strands (1) or not (-1).
The second one is an internal score which can be interpreted as a normalized weight ranging from 0 to 1 based on the respective e-values.
Moreover, a second comment line is followed after the species lines, e.g.

  ----------------------------------------
  # M.faa L.faa
  # Scores: 4     39      34.000000       39.000000
  ----------------------------------------

=over 4

=item <myproject>.B>ffadj-graph>

filtered blast data based on adaptive reciprocal best blast matches and synteny (only if -synteny is set)

=item <myproject>.B<poff-graph>

clustered ffadj graph. Its connected components are represented in myproject.poff.tsv (only if -synteny is set)

=back

=head1 EXAMPLES

=head2 Calling proteinortho

Sequences are typically given in plain fasta format like the files in test/

  test/C.faa:
  ----------------------------------------
  >C_10
  VVLCRYEIGGLAQVLDTQFDMYTNCHKMCSADSQVTYKEAANLTARVTTDRQKEPLTGGY
  HGAKLGFLGCSLLRSRDYGYPEQNFHAKTDLFALPMGDHYCGDEGSGNAYLCDFDNQYGR
  ...
  ----------------------------------------

  test/E.faa:
  ----------------------------------------
  >E_10
  CVLDNYQIALLRNVLPKLFMTKNFIEGMCGGGGEENYKAMTRATAKSTTDNQNAPLSGGF
  NDGKMGTGCLPSAAKNYKYPENAVSGASNLYALIVGESYCGDENDDKAYLCDVNQYAPNV
  ...
  ----------------------------------------

To run proteinortho for these sequences, simply call

  proteinortho6.pl test/C.faa test/E.faa test/L.faa test/M.faa

To give the outputs the name 'test', call

  proteinortho6.pl -project=test test/*faa

To use blast instead of the default diamond, call

  proteinortho6.pl -project=test -p=blastp+ test/*faa

=head2 POFF

The PoFF extension allows you to use the relative order of genes (synteny) as an additional criterion to disentangle complex co-orthology relations.
To do so, add the parameter -synteny.
You can use it to either come closer to one-to-one orthology relations by preferring synthetically conserved copies in the presence of two very similar paralogs (default),
or just to reduce noise in the predictions by detecting multiple copies of genomic areas (add the parameter -dups=3).
Please note that you need additional data to include synteny, namely the gene positions in GFF3 format.
As Proteinortho is primarily made for proteins, it will only accept GFF entries of type CDS (column #3 in the GFF-file).
The attributes column (#9) must contain Name=GENE IDENTIFIER where GENE IDENTIFIER corresponds to the respective identifier in the FASTA format.
It may not contain a semicolon (;)! Alternatively, you can also set ID=GENE IDENTIFIER.
Example files are provided in the source code archive.
Hence, we can run proteinortho6.pl -project=test -synteny test/A1.faa test/B1.faa test/E1.faa test/F1.faa to add synteny information to the calculations.
Of course, this only makes sense if species are sufficiently similar. You won't gain much when comparing e.g. bacteria with fungi.
When the analysis is done you will find an additional file in your current working directory, namely test.poff.tsv.
This file is equivalent to the .proteinortho.tsv file (above) but can be considered more accurate as synteny was involved for its construction.

=head1 Hints

Using .faa to indicate that your file contains amino acids and .fna to show it contains nucleotides makes life much easier.

Sequence IDs must be unique within a single FASTA file. Consider renaming otherwise. Note: Till version 5.15 sequences IDs had to be unique among the whole dataset. Proteinortho now keeps track of name and species to avoid the necessissity of renaming.

You need write permissions in the directory of your FASTA files as Proteinortho will create blast databases. If this is not the case, consider using symbolic links to the FASTA files.

The directory src contains useful tools, e.g. proteinortho_grab_proteins.pl which fetches protein sequences of orthologous groups from Proteinortho output table

=head1 AUTHORS

Marcus Lechner (lechner\@staff.uni-marburg.de), Clemens Elias Thoelken, Paul Klemm

=head1 ONLINE INFORMATION

For download and online information, see
L<https://www.bioinf.uni-leipzig.de/Software/proteinortho/>

=head1 REFERENCES

Lechner, M., Findeisz, S., Steiner, L., Marz, M., Stadler, P. F., & Prohaska, S. J. (2011). Proteinortho: detection of (co-) orthologs in large-scale analysis. BMC bioinformatics, 12(1), 124.

=cut

##########################################################################################
# Imports
##########################################################################################
use strict;
use warnings "all";
use File::Basename;
use threads;
use threads::shared;
use Thread::Queue;
use Pod::Usage; # --man
use POSIX;
#use Term::ReadKey; # press c to cancel, any other key to continue
##########################################################################################
# Variables
##########################################################################################
our $version = "6.0.12";
our $step = 0;    # 0/1/2/3 -> do all / only apply step 1 / only apply step 2 / only apply step 3
our $verbose = 1; # 0/1   -> don't / be verbose
our $debug = 0;   # 0/1   -> don't / show debug data
our $exactstep3 = 0;
#our $reflexiv = 0; # 0/1   -> check sets against themselves
our $synteny = 0; # 0/1   -> Apply synteny algorithm
our $neighbourjoin = 0; # 0/1   -> Merge neighbours
our $duplication = 2; # 0-9 not 1 -> Repeats for duplication extension
our $cs = 3;  # int   -> cs-value
our $alpha = 0.5; # Alpha value for src/proteinortho_ffadj_mcs.py
our $connectivity = "0.1"; # min algebraic connectivity (normal eigenvalue) for clustering step (-step=3), not the evalue cutoff for convergence
our $cpus   = 0;  # 0 = autodetect
our $evalue = "1e-05"; # evalue cutoff for blast step (-step=2), different to the clustering evalue cutoff (set in proteinortho_clustering.cpp)
our $purity = "1e-07"; # for clustering step: if entries (absolute value) of the vector are blow the purity -> treated as '0'
our $coverage = 50; # Percent coverage threshold for two proteins
our $identity = 25; # Percent identity threshold for two proteins
our $blastmode = "diamond";
#our $tmpdir = "./";  # Dir for tmp-files
our $sim = 0.95;
our $report = 3;
our $startat = undef; # removed 5.16
our $stopat = undef;  # removed 5.16
our $keep = 0;
our $force = 0;
our $selfblast = 0;
our $twilight = 0;
our $singles = 0;
our $clean = 0;
our $blastOptions = "";
our $clusterOptions = "";
our $nograph = 0;
our $doxml = 0;
our $desc = 0;
our $tmp_path = "";
our $useMcl = 0;

# Internal
our $blastversion = "unknown";  # Auto-detected blastmode version
our $binpath = "";
our $makedb = "";   # makedb command depends on blastmode
our $blast = "";    # blast command
our $jobque = Thread::Queue->new(); # Jobs todo
our $jobs_done:shared = 0;    # Counter
our $jobs_todo = 0;     # Sum of jobs
our $project = "myproject";   # Project name
our $graph_lock :shared;
our $syn_lock :shared;
our $all_jobs_submitted :shared = 0;
our $po_path = "";
our $run_id = "";
our %gene_counter;    # Holds the number of genes for each data file (for sorting)
our %max_gene_length_diamond;    # Holds the maximum length of genes for each data file (for diamond -> -sensitive option)
our $threads_per_process :shared = 1; # Number of subthreads for blast

our $freemem_inMB = -1; # -1 = detect automatically with 'free -m'

# Split work
our $split_to_X_jobs = -1;
our $jobnumber = -1;
our $part = -1;

our $checkblast=0;
our $checkfasta=0;
our $minspecies=1;

our $RED="\033[1;31m";
our $RED2="\033[1;31m";
our $GREEN="\033[1;32m";
our $BLUE="\033[1;36m";
our $ORANGE="\033[1;33m";
our $NC="\033[0m"; # No Color

my $tput=`tput color 2>/dev/null`; # test if the shell supports colors
$tput=~s/[\r\n]+$//;
if ($tput=~m/[^0-9]|^&/ && $tput <16) {
  $RED="";
  $GREEN="";
  $ORANGE="";
  $NC="";
}

# Find some Debian packaged software under their original names
$ENV{PATH} = "$ENV{PATH}:/usr/lib/debian-med/bin";

##########################################################################################
# Parameters
##########################################################################################
our @files = ();
our %files_map;
foreach my $option (@ARGV) {
  if ($option =~ m/^--?step=(0|1|2|3)$/)      { $step = $1;   }
  elsif ($option =~ m/^--?verbose$/)      { $verbose = 1;  }
  elsif ($option =~ m/^--?verbose=([012])$/)      { $verbose = $1;  }
  elsif ($option =~ m/^--?silent$/)      { $verbose = 0;  }
  elsif ($option =~ m/^--?unique$/)       { $checkblast = 1;  }
  elsif ($option =~ m/^--?checkblastgraph$/)      { $checkblast = 1;  }
  elsif ($option =~ m/^--?checkblast$/)       { $checkblast = 1;  }
  elsif ($option =~ m/^--?checkfasta$/)       { $checkfasta = 1;  }
  elsif ($option =~ m/^--?check$/)       { $checkfasta = 1; $checkfasta = 1; }
  elsif ($option =~ m/^--?cleanblast$/)       { $checkblast = 1;  }
  elsif ($option =~ m/^--?cleanupblast$/)       { $checkblast = 1;  }
  elsif ($option =~ m/^--?verbose=(0|1)$/)    { $verbose = $1;  }
  elsif ($option =~ m/^--?te?mp=(.+)$/)     { $tmp_path = $1;
                # make sure it ends with /
                unless ($tmp_path =~ /\/$/) {$tmp_path .= "/";}my $pwd=$ENV{"HOME"}; $tmp_path=~s/~/$pwd/g;
                if(! -d $tmp_path || ! -R $tmp_path || ! -W $tmp_path ){
                  &Error(" -tmp=$tmp_path is not accessible. Check if the directory exists and is read- and writable.");
                }else{
                  if($tmp_path =~ m/proteinortho_cache/){
                    $keep=1;
                  }
                }
              }
  elsif ($option =~ m/^--?debug$/)          { $debug = 1;  }
  elsif ($option =~ m/^--?exactstep3$/)           { $exactstep3 = 1;  }
  elsif ($option =~ m/^--?debug=([\da-zA-Z_]*)$/)         { $debug = $1;  }
  elsif ($option =~ m/^--?p=(.*)$/)         { $blastmode = $1; if($blastmode eq "blastn"){$blastmode.="+";} if($blastmode eq "blastp"){$blastmode.="+";}}
  elsif ($option =~ m/^--?e=(.*)$/)       { $evalue = $1; }
  elsif ($option =~ m/^--?cpus=(\d*)$/)       { $cpus = $1; }
  elsif ($option =~ m/^--?cpus=auto$/)          { $cpus = 0; }
  elsif ($option =~ m/^--?alpha=([0-9\.]+)$/)     { $alpha = $1; }
  elsif ($option =~ m/^--?purity=([0-9\.]+)$/)      { $purity = $1; }
  elsif ($option =~ m/^--?report=([0-9]+)$/)      { $report = $1; }
  elsif ($option =~ m/^--?minspecies=([0-9.]+)$/)       { if($1>=0){$minspecies = $1;}else{&Error("the argument -minspecies=$1 is invalid.$RED minspecies needs to be >=0!$NC\nminspecies: the min. number of genes per species. If a group is found with up to (minspecies) genes/species, it wont be split again (regardless of the connectivity).");} }
  elsif ($option =~ m/^--?conn=([0-9\.]+)$/)      { $connectivity = $1; }
  elsif ($option =~ m/^--?cov=([0-9]+)$/)       { $coverage = $1; }
  elsif ($option =~ m/^--?mcl$/)        { $useMcl = 1; }
  elsif ($option =~ m/^--?binpath=(.+)$/)     { $binpath  = $1."/"; if($binpath=~m/~/){my $pwd=$ENV{"HOME"}; $binpath=~s/~/$pwd/;} $binpath=~s/\/\/+/\//g; }
  elsif ($option =~ m/^--?identity=([0-9]+)$/)      { $identity = $1; }
  elsif ($option =~ m/^--?memory=([0-9]+)$/)      { $freemem_inMB = $1; }
  elsif ($option =~ m/^--?mem=([0-9]+)$/)     { $freemem_inMB = $1; }
  elsif ($option =~ m/^--?freememory=([0-9]+)$/)      { $freemem_inMB = $1; }
  elsif ($option =~ m/^--?ram=([0-9]+)$/)     { $freemem_inMB = $1; }
  elsif ($option =~ m/^--?identity=twilight$/)    { $twilight = 1; }
  elsif ($option =~ m/^--?sim=([0-9\.]+)$/)       { $sim = $1; }
  elsif ($option =~ m/^--?startat=([0-9]+)$/)     { $startat = $1; }
  elsif ($option =~ m/^--?stopat=([0-9]+)$/)      { $stopat = $1; }
  elsif ($option =~ m/^--?jobs?=([\d]+)\/([\d]+)$/)   { $jobnumber = $1; $split_to_X_jobs = $2; }
  elsif ($option =~ m/^--?selfblast$/)      { $selfblast = 1; }
  elsif ($option =~ m/^--?selfblast=(0|1)$/)    { $selfblast = $1; }
  elsif ($option =~ m/^--?singles$/)      { $singles = 1; }
  elsif ($option =~ m/^--?singles=(0|1)$/)    { $singles = $1; }
  elsif ($option =~ m/^--?poff$/)       { $synteny = 1;  } #print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---";
  elsif ($option =~ m/^--?synteny$/)      { $synteny = 1;  } #print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---"; 
  elsif ($option =~ m/^--?synteny=(0|1)$/)    { $synteny = $1; } #print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---"; 
  elsif ($option =~ m/^--?dups=0$/)     { $duplication = 0; }
  elsif ($option =~ m/^--?dups=([1-8])$/)   { $duplication = $1+1;}
  elsif ($option =~ m/^--?neighbourjoin$/)    { $neighbourjoin = 1; }
  elsif ($option =~ m/^--?neighbourjoin=(0|1)$/)  { $neighbourjoin = $1; }
  elsif ($option =~ m/^--?cs=([0-9]+)$/)    { $cs = $1; }
  elsif ($option =~ m/^--?keep$/)     { $keep = 1; }
  elsif ($option =~ m/^--?force$/)      { $force = 1; }
  elsif ($option =~ m/^--?clean$/)     { $clean = 1; }
  elsif ($option =~ m/^--?help$/)     { &print_header;print_usage_more(); exit 0;}
  elsif ($option =~ m/^--?test$/)     { &check_bins; &get_po_path; print "All necessary proteinortho_* binaries are found.\n"; exit 0;}
  elsif ($option =~ m/^--?h$/)     { &print_header;print_usage(); exit 0;}
  elsif ($option =~ m/^--?nograph$/)      { $nograph = 1; }
  elsif ($option =~ m/^--?xml$/)      { $doxml = 1; }
  elsif ($option =~ m/^--?graph$/)      { $nograph = 0; }
  elsif ($option =~ m/^--?desc$/)     { $desc = 1; }
  elsif ($option =~ m/^--?project=(.*)$/)   { $project = $1; $project=~s/[\/* \t\:\~\&\%\$\§\"\(\)\[\]\{\}\^\\]//g; }
  elsif ($option =~ m/^--?subparaBlast=(.*)$/i)  { $blastOptions = $1;}
  elsif ($option =~ m/^--?subparaCluster=(.*)$/i)  { $clusterOptions = $1;}
  elsif ($option =~ m/^--?v(ersion)?$/i)  { print $version."\n"; exit 0;}
  elsif ($option !~ /^-/)       { if(!exists($files_map{$option})){$files_map{$option}=1;push(@files,$option);}else{print STDERR "$ORANGE"."[WARNING]$NC The input $option was is skipped, since it was allready given as input.$NC\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n";sleep 10;print STDERR "Well then, proceeding...\n"} }
  elsif ($option =~ m/^--?man$/i){ my $bperldoc=(1-(`perldoc -l Pod::Usage 2>&1`=~m/need to install/)); if( !$bperldoc ){ print STDERR "You need to install the perl-doc package to use this man program.\n"; }else{ pod2usage(-exitstatus => 0, -verbose => 2) } exit 0; }
  else  {&print_usage(); &reset_locale();die "$RED"."[Error]$NC $ORANGE Invalid command line option: \'$option\'! $NC\n\n"; }
}

if($jobnumber != -1 && $tmp_path eq "" ){print STDERR "$ORANGE"."[WARNING]$NC You should use the -tmp option in combination with -jobs.\n";}
if($selfblast){$checkblast=1;}

$po_path = &get_po_path();    # Determine local path

our $nucleotideAlphabet="ACGTURYSWKMBDHVNXacgturyswkmbdhvnx\.\-";
our $aminoAlphabet="XOUBZACDEFGHIKLMNPQRSTVWYxoubzacdefghiklmnpqrstvwy\.\*\-";
our $allowedAlphabet = {
  'blastn_legacy' => 'n' ,
  'blastp_legacy' => 'a' ,
  'tblastx' => 'n' ,
  'blastn+' => 'n' ,
  'blastp+' => 'a' ,
  'tblastx+' => 'n' ,
  'diamond' => 'a' ,
  'rapsearch' => 'a' ,
  'blatp' => 'a' ,
  'blatn' => 'n' ,
  'blatx' => 'n' ,
  'mmseqsn' => 'n' ,
  'mmseqsp' => 'a' ,
  'lastp' => 'a' ,
  'lastn' => 'n' ,
  'topaz' => 'a' };
##MARK_FOR_NEW_BLAST_ALGORITHM
our $blastmode_pendant = { # if you choose blastp+ and the input is nucleotide -> choose the pendant blastn+ and restart (only if you check files with -checkfasta)
  'blastn_legacy' => 'blastp_legacy' ,
  'blastp_legacy' => 'blastn_legacy' ,
  'blastn+' => 'blastp+' ,
  'blastp+' => 'blastn+' ,
  'rapsearch' => 'a' ,
  'blatp' => 'blatn' ,
  'blatn' => 'blatp' ,
  'mmseqsn' => 'mmseqsp' ,
  'mmseqsp' => 'mmseqsn' ,
  'lastp' => 'lastn' ,
  'lastn' => 'lastp' };
##MARK_FOR_NEW_BLAST_ALGORITHM

my $restart_counter=-1; # restart only once


##########################################################################################
# Check parameters
##########################################################################################
if (defined($startat) || defined($stopat)) {
  &Error("Sorry, -startat and -stopat were removed. Please use -jobs=M/N for more flexible job splitting.");
}

if ($split_to_X_jobs == 0) {
  &Error("Job parameter use incorrectly. Number of jobs cannot be 0. Valid format: M/N");
}

if ($jobnumber != -1 && ($jobnumber == 0 || $jobnumber > $split_to_X_jobs)) {
  &Error("Job parameter use incorrectly. Job number cannot be 0 or greater than number of jobs. Valid format: M/N");
}
if ($jobnumber != -1) {
  if ($step != 2) {&Error("Parameter -jobs only works for step 2!");}
  $run_id = "_$jobnumber"."_".$split_to_X_jobs;
}


our $gccversionstr = `gcc --version 2>/dev/null`;
our $gccversion_main = "";
if ($? == 0) {
  $gccversion_main=($gccversionstr =~ m/^[^\n]+ ([\d]+)\.[\d\.]+\n/);
}
our $ompprocbind="close";
if($gccversion_main eq "" || $gccversion_main < 5){
  $ompprocbind="true";
}


our $simgraph = "$project.blast-graph$run_id";    # Output file graph
our $syngraph = "$project.ffadj-graph$run_id";    # Output file synteny

our $csimgraph = "$project.proteinortho-graph";   # Output file graph
our $csyngraph = "$project.poff-graph";       # Output file synteny
our $simtable = "$project.proteinortho.tsv";      # Output file graph
our $simtablehtml = "$project.proteinortho.html";      # Output file graph
our $syntable = "$project.poff.tsv";          # Output file synteny
our $syntablehtml = "$project.poff.html";          # Output file synteny
our $desctable = "$project.descriptions";     # Output file seq descriptions

our $LC_All = $ENV{'LC_All'};
our $LC_NUMERIC = $ENV{'LC_NUMERIC'};

# set the locale enviroment variable such that sort works with scientific numbers
$ENV{'LC_All'}='C';
$ENV{'LC_NUMERIC'}='C';

sub reset_locale{
  $ENV{'LC_NUMERIC'}=$LC_NUMERIC;
  $ENV{'LC_All'}=$LC_All;
}

sub get_parameter{
  return "Parameter-vector : (",'version',"=$version",",",'step',"=$step",",",'verbose',"=$verbose",",",'debug',"=$debug",",",'exactstep3',"=$exactstep3",",",'synteny',"=$synteny",",",'duplication',"=$duplication",",",'cs',"=$cs",",",'alpha',"=$alpha",",",'connectivity',"=$connectivity",",",'cpus',"=$cpus",",",'evalue',"=$evalue",",",'purity',"=$purity",",",'coverage',"=$coverage",",",'identity',"=$identity",",",'blastmode',"=$blastmode",",",'sim',"=$sim",",",'report',"=$report",",",'keep',"=$keep",",",'force',"=$force",",",'selfblast',"=$selfblast",",",'twilight',"=$twilight",",",'singles',"=$singles",",",'clean',"=$clean",",",'blastOptions',"=$blastOptions",",",'nograph',"=$nograph",",",'xml',"=$doxml",",",'desc',"=$desc",",",'tmp_path',"=$tmp_path,",'blastversion',"=$blastversion",",",'binpath',"=$binpath",",",'makedb',"=$makedb",",",'blast',"=$blast",",",'jobs_todo',"=$jobs_todo",",",'project',"=$project",",",'po_path',"=$po_path",",",'run_id',"=$run_id",",",'threads_per_process',"=$threads_per_process",",","useMcl","=$useMcl",',freemem',"=$freemem_inMB",")\n";
}

if (-e "$project.proteinortho.tsv" && $step!=1 && ( ( scalar(@files) > 2 && $step!=3 ) || $step==3 ) ) {
  print STDERR "!!!$ORANGE\n[Warning]:$NC Data files for project '$project' already exists. Previous output files might be overwritten.\n$NC";
  print STDERR "Press 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
    sleep 10;
  print STDERR "\nWell then, proceeding...\n\n";
}

# if($step!=1){
  if($tmp_path eq ""){$tmp_path=".";}
  if( $tmp_path !~ qr/proteinortho_cache_$project/ ){
    if(! -d $tmp_path."/proteinortho_cache_$project"){
      system("mkdir $tmp_path/proteinortho_cache_$project");
      if( -d $tmp_path."/proteinortho_cache_$project"){
        $tmp_path.="/proteinortho_cache_$project/";
      }
    }else{
      $tmp_path.="/proteinortho_cache_$project/";
    }
  }else{
    if(!-d $tmp_path){
     mkdir("$tmp_path");
    }
  }
# }

our $rm_simgraph = "$tmp_path$project.removed_blast-graph";   # Output remove graph
our $rm_syngraph = "$tmp_path$project.removed_ffadj-graph";   # Output remove graph

RESTART:

$restart_counter++;

##########################################################################################
# Run
##########################################################################################
if(scalar @ARGV >1){
  open(INFO,">>$project.info") || &Error("Could not open graph '$project.info': $!");
  print INFO "SYSTEMCALL of ".localtime(time()).":\nproteinortho ".join(" ",@ARGV)."\n";
  print INFO &get_parameter;
  print INFO "--------\n";
  close(INFO);
}

&print_header;    # Show Proteinortho Header
&auto_cpus;   # Set number of CPUs
if($step < 3){ # don't check blast-bins (e.g. diamond) for step 3=clustering (not necessary)
  &check_bins;    # Check blast,hmmer,diamond,... version
}

# Always do
&check_files;   # Check files, count sequences
@files = ();
foreach my $file (sort { if ($gene_counter{$a} == $gene_counter{$b}) {$a cmp $b;} else {$gene_counter{$b} <=> $gene_counter{$a};} } keys %gene_counter) {push(@files,$file);} # Biggest first # Alphabet otherwise 5.16

# Step 1, check files and make indices
if ($step == 0 || $step == 1) {
  if($verbose){print STDERR "\n$GREEN**Step 1**$NC\n";}
  &generate_indices(@files);  # Generate index files for blast
  if ($desc) {
    &write_descriptions;  # Write sequence description file
  }
}

# Step 2, run blast and synteny algorithm
if ($step == 0 || $step == 2) {
  if($verbose){print STDERR "\n$GREEN**Step 2**$NC using $blastmode ";
  if($synteny || $selfblast){print STDERR "with :";}
  if($synteny){print STDERR " synteny";}
  if($selfblast){print STDERR " selfblast";}
  print STDERR "\n"; }

  &init_graph_output; # Initiate Output file(s)
  &run_blast;   # Run blasts
}

# Check for duplicated edges if --unique or --checkblast
if($checkblast == 1){

  print STDERR "\n$GREEN**Step 2.5 Checking blast graph(s)**$NC\n";

  if($synteny){
    system("$po_path/proteinortho_cleanupblastgraph $syngraph"."* >tmp_$syngraph 2>/dev/null && mv tmp_$syngraph $syngraph"."_clean");
    $syngraph.="_clean";
    print STDERR "A clean version without any duplicated edges of $syngraph is saved to $syngraph"."_clean\n";
  }else{
    system("$po_path/proteinortho_cleanupblastgraph $simgraph"."* >tmp_$simgraph 2>/dev/null && mv tmp_$simgraph $simgraph"."_clean");
    $simgraph.="_clean";
    print STDERR "A clean version without any duplicated edges of $simgraph is saved to $simgraph"."_clean\n";
  }
}
# Step 3, spacial clustering
if ($step == 0 || $step == 3) {
  if($verbose){print STDERR "\n$GREEN**Step 3**$NC\n";}
  &cluster;             # form clusters and write outputs
  if($verbose){print STDERR "\n$GREEN"."All finished.$NC\n";}
}
if ($clean) {&clean;}           # remove blast indices

if ($nograph) {
  unlink($simgraph);
  if ($synteny) {unlink($syngraph);}
}

#clean up tmp dir
if (!$keep && $tmp_path =~ m/\/proteinortho_cache_[^\/]+\d*\/$/ && $step!=1){system("rm -r $tmp_path >/dev/null 2>&1");}

##########################################################################################
# Functions
##########################################################################################

sub clean {
  print STDERR "Removing temporary files...\n";
  foreach my $file (@files) {
    system("rm $file.$blastmode"."*");
  }

}

sub cluster {

  if(!$useMcl){

    # run the free command to calculate the free memory (in MB)
    if($freemem_inMB == -1){
      $freemem_inMB = `free -m 2>/dev/null`;
        if(defined($freemem_inMB) && $freemem_inMB){
        my @freemem_arr = split("\n",$freemem_inMB);
        $freemem_inMB=2048; #default value 2GiB
        if(scalar @freemem_arr>0){
          my @freemem_arr2 = split(" ",$freemem_arr[1]); #first row after header = Mem
          my $ci = 0;
          foreach (@freemem_arr2) {
            if($_ ne ""){
              if($ci eq 1){ # first column = total
                $freemem_inMB=$_*0.75;
                if($verbose){print STDERR "Clustering by similarity (Proteinortho mode) using up to $freemem_inMB MB of memory (75% of total memory) and $cpus cpu core(s). Adjust this behaviour with the -mem option.\n";}
                last;
              }
              $ci=$ci+1;
            }
          }
        }
      }else{
        $freemem_inMB=16384;
        if($verbose){print STDERR "Clustering by similarity (Proteinortho mode) using up to $freemem_inMB MB of memory (default value, command 'free' not found) and $cpus cpu core(s). Adjust this behaviour with the -mem option.\n";}
      }
    }else{
      if($verbose){print STDERR "Clustering by similarity (Proteinortho mode) using up to $freemem_inMB MB of memory and $cpus cpu core(s). Adjust this behaviour with the -mem option.\n";}
    }
    #if($freemem_inMB < 5){if($verbose){print STDERR "$ORANGE"."[WARNING]$NC Increased the maximal memory threshold to 5 MB ($freemem_inMB MB is not sufficent).$NC\n";}$freemem_inMB=5;} # minimum 5 MB
    if($freemem_inMB < 1000){if($verbose){print STDERR "$ORANGE"."[WARNING]$NC The memory threshold of $freemem_inMB MB seems somewhat small. I will increase the threshold to 1 GB (-ram=1000).$NC\n";}$freemem_inMB=1024;} # minimum 5 MB
      }else{
    if($verbose){print STDERR "$ORANGE"."[WARNING]$NC Using MCL for clustering (-ram has no effect).$NC\n";} # minimum 5 MB
  }

  if(!$useMcl){
    system ("OMP_PROC_BIND=$ompprocbind $po_path/proteinortho_clustering -minspecies $minspecies -ram ".$freemem_inMB." -kmere ".(1-$exactstep3)." -debug $debug -cpus $cpus -weighted 1 -verbose $verbose -conn $connectivity -purity $purity $clusterOptions -rmgraph '$rm_simgraph' $simgraph* >'$simtable'");
    if ($? != 0) {
          &Error("proteinortho_clustering failed. Did you use the static version?\nMaybe your operating system does not support the statically compiled version, please try recompiling proteinortho with 'make clean' and 'make' (and 'make install PREFIX=...').");
        }
    }else{
  my $stderrout = `$po_path/proteinortho_do_mcl.pl $cpus $simgraph*`;
    if ($? != 0) {
      print STDERR $stderrout;
      &Error("proteinortho_do_mcl.pl failed. Do you have a woring mcl version installed? (Try 'mcl' in the terminal) If not please install with e.g. 'apt-get install mcl'.");
    }
    system("mv mcl.proteinortho $simtable");
    system("mv mcl.proteinortho-graph $csimgraph");
  }

  if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $simtable\n";}
  if(scalar @files < 10){
    if($verbose){print STDERR "You can extract the fasta files of each orthology group with 'proteinortho_grab_proteins.pl -tofiles $simtable ".join(" ",@files)."'\n (Careful: This will generate a file foreach line in the file $simtable).\n";}
  }
  
  if ($singles) {
    if($verbose){print STDERR "Adding singles...\n";}
    my $fastas = "'".join("' '",@files)."'";
    system("$po_path/proteinortho_singletons.pl $fastas <'$simtable' >>'$simtable'");
  }


  if (!$nograph && !$useMcl) {
    system("$po_path/proteinortho_graphMinusRemovegraph '$rm_simgraph' $simgraph* >'$csimgraph'");
    unless ($keep) {unlink($rm_simgraph);}
    if($verbose){print STDERR "[OUTPUT] -> Orthologous pairs are written to $csimgraph\n";}
  }elsif($useMcl){
    if($verbose){print STDERR "[OUTPUT] -> Orthologous pairs are written to $csimgraph\n";}
  }
  if(-x "$po_path/proteinortho_summary.pl"){
    system("$po_path/proteinortho_summary.pl '$csimgraph' >>'$csimgraph.summary'");
  }

  if(scalar @files < 10){
    system("perl $po_path/proteinortho2html.pl $simtable ".join(" ",@files)." >$simtablehtml");
    if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $simtablehtml\n";}
  }else{
    if($verbose){print STDERR "[OUTPUT] -> You can extract a html version of the output using :\nproteinortho2html.pl $simtable [PLACE FASTA FILES HERE] >$simtablehtml\n\n";}
  }

  if ($doxml) {
    system("perl $po_path/proteinortho2xml.pl $simtable >$simtable.xml");
    if ($? != 0) {system("rm $simtable.xml"); &Error("proteinortho2xml.pl failed");}
    if($verbose){print STDERR "[OUTPUT] -> Generated the ortho-xml file $simtable.xml";}
  }

  if ($synteny) {
    if($verbose){print STDERR "\n"."Clustering by gene-order ($GREEN"."POFF mode$NC)\n";}
    if(!$useMcl){
      system ("OMP_PROC_BIND=$ompprocbind $po_path/proteinortho_clustering $clusterOptions -minspecies $minspecies -ram ".$freemem_inMB." -kmere ".(1-$exactstep3)." -debug $debug -cpus $cpus -weighted 1 -verbose $verbose -conn $connectivity -purity $purity $clusterOptions -rmgraph '$rm_syngraph' $syngraph* >'$syntable'");
      if ($? != 0) {
        &Error("proteinortho_clustering failed. Did you use a static version? Maybe your operating system does not support the static compiled version, please recompile 'make clean' and 'make' or 'make USEPRECOMPILEDLAPACK=FALSE'.");
      }
    }else{
      system($po_path.'/proteinortho_do_mcl.pl '.$cpus.' '.$syngraph.'*');
      system("mv mcl.proteinortho $syntable");
      system("mv mcl.proteinortho-graph $csyngraph");
    }

    if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $syntable\nYou can extract the fasta files of each orthology group with 'proteinortho_grab_proteins.pl -tofiles $syntable ".join(" ",@files)."'\n(Careful: This will generate a file foreach line in the file $syntable).\n";}

    if ($singles) {
      if($verbose){print STDERR "Adding singles...\n";}
      my $fastas = "'".join("' '",@files)."'";
      system("$po_path/proteinortho_singletons.pl $fastas <'$syntable' >>'$syntable'");
    }

    if (!$nograph && !$useMcl) {
      system("$po_path/proteinortho_graphMinusRemovegraph '$rm_syngraph' $syngraph* >'$csyngraph'");
      unless ($keep) {unlink($rm_syngraph);}
      if($verbose){print STDERR "[OUTPUT] -> Orthologous pairs written to $csyngraph\n";}
    }elsif($useMcl){
      if($verbose){print STDERR "[OUTPUT] -> Orthologous pairs are written to $csyngraph\n";}
    }
    if(-x "$po_path/proteinortho_summary.pl"){
      system("$po_path/proteinortho_summary.pl '$csyngraph' >>'$csyngraph.summary'");
    }
    
    system("perl $po_path/proteinortho2html.pl $syntable ".join(" ",@files)." >$syntablehtml");
    if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $syntablehtml\n";}

    if ($doxml) {
      system("perl $po_path/proteinortho2xml.pl $syntable >$syntable.xml");
      if ($? != 0) {system("rm $syntable.xml"); &Error("$ORANGE\n[WARNING]$NC -> proteinortho2xml.pl failed");}
      if($verbose){print STDERR "[OUTPUT] -> Generated the ortho-xml file $syntable.xml";}
    }
  }
}

sub print_header {
  if($verbose){print STDERR "*****************************************************************\n";
  print STDERR "$GREEN"."Proteinortho$NC with PoFF version $version - An orthology detection tool\n",
  "*****************************************************************\n";}
}


sub print_usage {
print STDERR "
     |
  ${BLUE}  /${NC} ${RED2}\\  $NC
  ${BLUE} /\\${NC}${RED2} /\\ $NC
  ${BLUE}/ ${RED2}/${BLUE} \\ ${RED2}\\${NC}

Usage: proteinortho6.pl [OPTIONS] FASTA1 FASTA2 [FASTA...] (one for each species, at least 2)
Options:
         [General options]
         -project=    prefix for all result file names [default: myproject]
         -cpus=       number of processors to use [default: auto]
         -ram=        maximal used ram threshold for LAPACK and the input graph in MB [default: 75% of the total memory]
         -silent      sets verbose to 0
         -temp=       path for temporary files [default: working directory]
         -keep        stores temporary blast results for reuse
         -force       forces the recalculation of the blast results in any case in step=2. Also forces the recreation of the database generation in step=1
         -clean       remove all unnecessary files after processing
         -step=       1 -> generate indices
                      2 -> run blast (and ff-adj, if -synteny is set)
                      3 -> clustering
                      0 -> all (default)

         [Search options]
         -p=          blast program [default: diamond]
                      {blastp|blastn|tblastx|blastp_legacy|blastn_legacy|tblastx_legacy|diamond|usearch|ublast|lastp|lastn|rapsearch|topaz|blatp|blatn|mmseqsp|mmseqsn}
                      The suffix 'p' or 'n' indicates aminoacid fasta files (p) or nucleotide fasta files (n).
                      The suffix '_legacy' indicates legacy blastall (otherwise blast+ is used).
         -e=          E-value for blast [default: 1e-05]

         [Synteny options]
         -synteny     activate PoFF extension to separate similar sequences print
                      by contextual adjacencies (requires .gff for each .fasta)
         -alpha=      PoFF: weight of adjacencies vs. sequence similarity
                      (default: 0.5)

         [Clustering options]
         -singles     report singleton genes without any hit
         -conn=       min. algebraic connectivity [default: 0.1]
         -xml        produces an OrthoXML formatted file of the *.proteinortho.

         (...) show more with --help

For more information see the man page: 'proteinortho -man' or online: https://gitlab.com/paulklemm_PHD/proteinortho
Or you can use the GUI proteinorthoHelper.html (available at http://lechnerlab.de/proteinortho/)
Do you have suggestions or need more help: write a mail to lechner\@staff.uni-marburg.de.

";
}

sub print_usage_more {
print STDERR "
     |
  ${BLUE}  /${NC} ${RED2}\\  $NC
  ${BLUE} /\\${NC}${RED2} /\\ $NC
  ${BLUE}/ ${RED2}/${BLUE} \\ ${RED2}\\${NC}

Usage: proteinortho6.pl [OPTIONS] FASTA1 FASTA2 [FASTA...] (one for each species, at least 2)
Options:
         [General options]
         -project=    prefix for all result file names [default: myproject]
         -cpus=       number of processors to use [default: auto]
         -ram=        maximal used ram threshold for LAPACK and the input graph in MB [default: 75% of the total memory]
         -silent      sets verbose to 0
         -temp=       path for temporary files [default: working directory]
         -keep        stores temporary blast results for reuse
         -force       forces the recalculation of the blast results in any case in step=2. Also forces the recreation of the database generation in step=1
         -clean       remove all unnecessary files after processing
         -step=       1 -> generate indices
                      2 -> run blast (and ff-adj, if -synteny is set)
                      3 -> clustering
                      0 -> all (default)

         [Search options]
         -p=          blast program [default: diamond]

                      {blastp|blastn|tblastx|blastp_legacy|blastn_legacy|tblastx_legacy|diamond|usearch|ublast|lastp|lastn|rapsearch|topaz|blatp|blatn|mmseqsp|mmseqsn}

                      The program need to be installed first.
                      A suffix 'p' or 'n' indicates aminoacid fasta files (p) or nucleotide fasta files (n).
                      The '_legacy' suffix indicates legacy blastall (otherwise blast+ is used).

                        blast*|tblastx : standard blast+ family (blastp : protein files, blastn : dna files)
                        blast*_legacy : legacy blast family (blastall)
                        diamond : Only for protein files! standard diamond procedure and for genes/proteins of length >40 with the additional --sensitive flag
                        usearch : usearch_local procedure with -id 0 (minimum identity percentage).
                        ublast : usearch_ublast procedure.
                        lastn : standard lastal. Only for dna files!
                        lastp : lastal using -p and BLOSUM62 scoring matrix. Only for protein files!
                        rapsearch : Only for protein files!
                        topaz : Only for protein files!
                        blat* : Blat family. blatp : For protein files! blatn : For dna files! blatx : For dna files!
                        mmseqs* : mmseqs family. mmseqsp : For protein files! mmseqsn : For dna files! blatx : For dna files!

         -e=          E-value for blast [default: 1e-05]
         -selfblast   apply selfblast, detects paralogs without orthologs
         -sim=        min. similarity for additional hits (0..1) [default: 0.95]
         -identity=   min. percent identity of best blast hits [default: 25]
         -cov=        min. coverage of best blast alignments in % [default: 50]
         -subpara=    additional parameters for the search tool
                      example -subpara='-seg no' or -subpara='--more-sensitive' for diamond

         [Synteny options]
         -synteny     activate PoFF extension to separate similar sequences print
                      by contextual adjacencies (requires .gff for each .fasta)
         -dups=       PoFF: number of reiterations for adjacencies heuristic,
                      to determine duplicated regions (default: 0)
         -cs=         PoFF: Size of a maximum common substring (MCS) for
                      adjacency matches (default: 3)
         -alpha=      PoFF: weight of adjacencies vs. sequence similarity
                      (default: 0.5)

         [Clustering options]
         -singles     report singleton genes without any hit
         -conn=       min. algebraic connectivity [default: 0.1]
         -xml         produces an OrthoXML formatted file of the *.proteinortho.tsv file.
         -purity=     avoid spurious graph assignments, the higher the more uncertain edges are cut [0-1, default: 1e-07]
         -mcl         enables the mcl algorithm for clustering instead of power iteration or lapacks routine. (needs mcl to be installed) 
         -nograph     do not generate .proteinortho-graph file (pairwise orthology relations)

         [Misc options]
         -desc        write description files (for NCBI FASTA input only)
         -debug       gives detailed information for bug tracking
         -binpath=    path to your directory of local programs that are not available globally (this should not be needed)

         [Large compute jobs]
          In case jobs should be distributed onto several machines, use
         -jobs=M/N    N defines the number of defined job groups (e.g. PCs)
                      M defines the set of jobs to run in this process

         Example:     run with 
                        -step=1
                      to prepare data then to split a run on two machines use
                        -jobs=1/2 -step=2 on PC one and
                        -jobs=2/2 -step=2 on PC two
                      finally run with
                        -step=3 to finalize

For more information see the man page: 'proteinortho -man' or online: https://gitlab.com/paulklemm_PHD/proteinortho
Or you can use the GUI proteinorthoHelper.html (available at http://lechnerlab.de/proteinortho/)
Do you have suggestions or need more help: write a mail to lechner\@staff.uni-marburg.de.

";
}

# -memory=     the amount of ram used for partioning in MB [default 75% of the total memory]

sub init_graph_output {
# if (-e $graph) {
#   &Error("Graph output file '$graph' already exists.");
# }
  open(GRAPH,">$simgraph") || &Error("Could not open graph '$simgraph': $!");
  print GRAPH "# file_a\tfile_b\n# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba\n";
  close(GRAPH);

  unless ($synteny) {return;}
# if (-e $syn) {
#   &Error("Synteny Graph output file '$syn' already exists.");
# }
  open(SYN,">$syngraph") || &Error("Could not open graph '$syngraph': $!");
  print SYN "# file_a\tfile_b\n# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba\tsame_strand\tsimscore\n";
  close(SYN);
}

sub set_threads_per_process {
  lock($jobs_done);
  my $willdo = ($jobs_todo-$jobs_done+$_[0]);

  if ($debug) {
    print STDERR "\nTODO: $jobs_todo DONE: $jobs_done Running: $_[0] -> $willdo\n";
  }

  if ($willdo < 1) {return;}

  my $optimal = int($cpus/$willdo);
  lock($threads_per_process);
  if ($optimal > $threads_per_process) {
    $threads_per_process = $optimal;
    if ($debug) {
      print STDERR "\nBlast subthreads was set to $threads_per_process\n";
    }
  }
}

sub run_blast {
  # Jobs todo
  $jobs_todo = 0;
  for (my $i = 0; $i < scalar(@files)-1+$selfblast; $i++) {$jobs_todo += scalar(@files)-$i-1+$selfblast;}

  # Divide 5.16
  $part = int($jobs_todo/$split_to_X_jobs); # Round up to not miss anything
# print STDERR "$jobs_todo/$split_to_X_jobs = $part\n";
  my $from = ($jobnumber-1)*($part)+1;
  if ($jobnumber == 1) {$from = 1;}
  my $to = ($jobnumber-1)*($part)+$part;
  if ($jobnumber == $split_to_X_jobs) {$to = $jobs_todo;}

  $part = 1+$to-$from;  # real part

  if ($split_to_X_jobs != -1 && $part < 1) {&Error("I have problems coordinating $split_to_X_jobs groups for $jobs_todo jobs.");}
  if ($split_to_X_jobs <= 1) {$from = -1; $to = -1; $split_to_X_jobs = -1; $part = -1;}

# print STDERR "$from - $to (TODO: $jobs_todo)\n";

  &set_threads_per_process(0);  # Check if we can apply more threads, nothing is running so far
  &print_blast_stats();

  # Spawn worker threads
  for (my $i = 0; $i < $cpus; $i++) {threads->create('workerthread');}

  # For each file against each other file
  my $job_number = 0;
  SPEC: for (my $i = 0; $i < scalar(@files)-1+$selfblast; $i++) {
    for (my $j = $i+1-$selfblast; $j < scalar(@files); $j++) {
      $job_number++;
      if ($from != -1 && $job_number < $from) {next;}
      # Wait for queque to get empty (with some buffer)
      while ($jobque->pending() > 32 + 2*$cpus) {sleep(1);}
      # Syncronize with other processes
      $jobque->enqueue("$i $j");
#     print STDERR "EN: $job_number -> $i $j\n";
      if ($to != -1 && $job_number >= $to) {last SPEC;}
    }
  }
  # Tell all threads they can stop
  {lock($all_jobs_submitted); $all_jobs_submitted = 1;}

  # Wait until all jobs are done
  foreach (threads->list())   {$_->join();}
  &print_blast_stats();   if($verbose){print STDERR "\n";}
  if($verbose){print STDERR "[OUTPUT] -> written to $simgraph\n";}
}

sub workerthread {

  my $thread_id = threads->tid();
  my $temp_file = "$tmp_path$project-$run_id-$thread_id";

  $temp_file=~s/^\.\///;

  # Clean up, just to be safe
  unlink("$temp_file.tmp");
  unlink("$temp_file.log");
  unlink("$temp_file.matching");

  while (1) {
    my ($i, $j);
    while (1) {
      # Fetch new jobs
      my $job = $jobque->dequeue_nb();
      # If there is nothing
      unless (defined($job)) {
        # Check if more jobs need to be done

        {
        lock($jobs_done);             #   Productive
        if ($jobs_done >= $jobs_todo || ($part != -1 && $jobs_done >= $part)) { #   Productive
#       lock($all_jobs_submitted);            #   DEBUGGING
#       if ($all_jobs_submitted) {            #   DEBUGGING
          if ($debug) {print STDERR "Thread $thread_id\tis leaving\n";}
          return;
        }}
        # If so, wait
          if ($debug) {print STDERR "Thread $thread_id\tis seeking work ($jobs_done / $jobs_todo)\n";}
        sleep(1);
      }
      else {
        # Parse job
        ($i, $j) = split(" ",$job);
        # Break the fetch loop
        last;
      }
    }

    my $file_i = $files[$i];
    my $file_j = $files[$j];
    my $short_file_i = $file_i; $short_file_i =~ s/^.*\///;
    my $short_file_j = $file_j; $short_file_j =~ s/^.*\///;

    # Work
    &set_threads_per_process(scalar(threads->list()));
    my $result_ij = &blast($file_i,$file_j,$thread_id);

    my $result_ji;
    if ($file_i eq $file_j) {
      # One run is enough (selfblast)
      $result_ji = $result_ij;
    }
    else {
      $result_ji = &blast($file_j,$file_i,$thread_id);
    }

    if ($file_i eq $file_j && !$selfblast) {&Error("Selfblast is disabled but I want to check $file_i vs $file_j");}

    my %lengths;
    if ($file_i eq $file_j) {
      # Get lengths once is enough (selfblast)
      %lengths = %{&get_gene_lengths($file_i)};
    }
    else {
      %lengths = %{&get_gene_lengths($file_i,$file_j)};
    }
    my %reciprocal_matches = %{&match(\%lengths,$result_ij,$result_ji)};

    # Remove secondary hits if better exist (test here instead of later)
    %reciprocal_matches = %{&adaptive_best_blast_matches(\%reciprocal_matches)};

    if ($synteny) {
      my ($ordered_matches, $track_pointer, $close_copies_pointer) = &synteny_matches(\%reciprocal_matches,$file_i,$file_j);
      open(PREGRAPH,">>$temp_file.tmp") || &Error("Could not open temp file '$temp_file.tmp': $!");
      print PREGRAPH $ordered_matches;
      close(PREGRAPH);
      my $ffadj_param = "-a $alpha";
      if ($duplication) { $ffadj_param .= " -R $duplication -M $cs";}
      my $cmd = "$po_path/proteinortho_ffadj_mcs.py $ffadj_param '$temp_file.tmp'";
      if ($debug) {print STDERR "$cmd\n";}
      my $synt_stats = qx($cmd);
      $synt_stats=~s/[\r\n]+$//;
      $synt_stats =~ s/#.+\n//;

      # Reverse mapping of full gene ids, two seperate maps in case of overlapps in short ids
      my %full_id_map_i;
      my %full_id_map_j;
      foreach (sort keys %reciprocal_matches) {
        my ($a, $b) = split(/\s/,$_);
        my $aa = &convertNCBI($a);
        my $bb = &convertNCBI($b);
        if ($aa ne $a) {
          if ($debug) {print STDERR "j_map: $aa -> $a\n";}
          $full_id_map_j{$aa} = $a;}
        if ($bb ne $b) {
          if ($debug) {print STDERR "i_map: $bb -> $b\n";}
          $full_id_map_i{$bb} = $b;}
      }

      # Reverse mapping of gene position to short id
      my %track = %{$track_pointer};
      my %close = %{$close_copies_pointer};
      # Generate hash for synteny hits
      my %synteny;

      unless (-s "$temp_file.matching") {
        print STDERR "\n$RED [Error] Failed to run $po_path/proteinortho_ffadj_mcs.py for\n$file_i vs $file_j\nMoving source to $temp_file.err for debugging\nI will continue, but results may be insufficient.$NC \n\n";
        system("mv $temp_file.tmp $temp_file.err");
        next;
      }

      open(OSYNGRAPH,"<$temp_file.matching") || &Error("Could not open temp file $temp_file.matching: $!'");
      while(<OSYNGRAPH>) {
          $_=~s/[\r\n]+$//;
          my ($i, $j, $score) = split(/\s+/,$_,3);
          if (!defined($score) || $i =~ /[^0-9]/ || $i == 0 || length($i) > 10) {next;}
          unless (defined($track{$file_i.$i})) {
            print STDERR "Could not find i: ".$file_i.$i."\n";  next;
          }
          unless (defined($track{$file_j.$j})) {
            print STDERR "Could not find j: ".$file_j.$j."\n";  next;
          }
          # Remap to full ID
          my $a = $track{$file_i.$i};
          if (defined($full_id_map_i{$a})) {$a = $full_id_map_i{$a};}
          my $b = $track{$file_j.$j};
          if (defined($full_id_map_j{$b})) {$b = $full_id_map_j{$b};}
          # Store
          $synteny{"$b $a"} = $score;

          # Close copies
          if ($neighbourjoin && defined($close{$i})) {
            my @partners = split(',',$close{$i});
            foreach (@partners) {
              my $c = $track{$file_i.$_};
              if (defined($full_id_map_i{$c})) {$c = $full_id_map_i{$c};}
              # Store
              $synteny{"$b $c"} = $score;
              if ($debug) {print STDERR "Storing addional proximity edge $a & $b -> $c\n";}
            }
          }
        }
        close(OSYNGRAPH);
        unlink("$temp_file.tmp");
        unlink("$temp_file.log");
        unlink("$temp_file.matching");

      {
      lock($syn_lock);

      open(SYN,">>$syngraph") || &Error("Could not open file '$syngraph': $!");
      print SYN "# $short_file_j\t$short_file_i\n";
      print SYN "# Scores: $synt_stats\n";
      foreach (sort keys %reciprocal_matches) {
        if (!defined($synteny{$_})) {if ($debug) {print STDERR "FAIL: $_\n";} next;}    # Not reported by gene-order algo
        my $line = "$_ ".$reciprocal_matches{$_}." $synteny{$_}";
        $line =~ s/ /\t/g;
        print SYN "$line\n";
      }
      close(SYN);
      }

    }

    {
      lock($graph_lock);

      my @arr0; #evalue ab
      my @arr1; #biscore ab
      my @arr2; #evalue ba
      my @arr3; #bitscore ba
      foreach (sort keys %reciprocal_matches) {
        my @spl=split(" ",$reciprocal_matches{$_});
        push(@arr0,$spl[0]);
        push(@arr1,$spl[1]);
        push(@arr2,$spl[2]);
        push(@arr3,$spl[3]);
      }

      open(GRAPH,">>$simgraph") || &Error("Could not open file '$simgraph': $!");
      print GRAPH "# $short_file_j\t$short_file_i\n# ".median(@arr0)."\t".median(@arr1)."\t".median(@arr2)."\t".median(@arr3)."\n";
      foreach (sort keys %reciprocal_matches) {
        my $line = "$_ ".$reciprocal_matches{$_};
        $line =~ s/ /\t/g;
        print GRAPH "$line\n";
      }
      close(GRAPH);
    }
    # Count
    {
      lock($jobs_done);
      $jobs_done++;
    }
    &print_blast_stats(); # Needs jobs_done to be free
  }
}

sub sum {
  my $ret = 0;
  foreach (@_){
    $ret += $_;
  }
  return $ret;
}
sub median {
  sum( ( sort { $a <=> $b } @_ )[ int( $#_/2 ), ceil( $#_/2 ) ] )/2;
}

sub identitybylength {
        # Accoding to the formula of Rost, 1999 (Twilight-zone)
  if($_[0] <= 11)   {return 100;}
  if($_[0] <= 450)  {return 480*$_[0]**(-0.32*(1+exp(-$_[0]/1000)));}
  return 19.5;
}

sub adaptive_best_blast_matches {
  my %reciprocal_matches = %{(shift)};

  if ($debug) {
    print STDERR "\nStart with ";
    print STDERR scalar(keys %reciprocal_matches);
    print STDERR " edges\n";
  }

  my %best;
  my %best_gen;
  # Gather best
  foreach (keys %reciprocal_matches) {
    my ($a,$b) = split(" ",$_);
    my ($evalue_ab,$bitscore_ab,$evalue_ba,$bitscore_ba) = split(" ",$reciprocal_matches{$_});
    if (!defined($best{$a}) || $best{$a} < $bitscore_ab) {
      $best{$a} = $bitscore_ab;
      $best_gen{$a} = $b;
    }
    if (!defined($best{$b}) || $best{$b} < $bitscore_ba) {
      $best{$b} = $bitscore_ba;
      $best_gen{$b} = $a;
    }
  }

  if ($debug) {
    my %gene_num;
    # Count gene number
    foreach (keys %reciprocal_matches) {
      my ($a,$b) = split(" ",$_);
      $gene_num{$a}++;
      $gene_num{$b}++;
    }
    print STDERR "Number of genes: ".scalar(keys %gene_num)."\n";

    foreach (keys %best) {
      print STDERR "Best score for $_ is $best{$_} ($best_gen{$_})\n";
    }
  }

  # Remove worse
  foreach (keys %reciprocal_matches) {
    my ($a,$b) = split(" ",$_);
    my ($evalue_ab,$bitscore_ab,$evalue_ba,$bitscore_ba) = split(" ",$reciprocal_matches{$_});
    if  ($best{$a}*$sim > $bitscore_ab) {delete $reciprocal_matches{$_}; if ($debug) {my $v = $bitscore_ab/$best{$a}; print STDERR "Removed $_ because $best{$a} vs $bitscore_ab ($v)\n";}}
    elsif   ($best{$b}*$sim > $bitscore_ba) {delete $reciprocal_matches{$_}; if ($debug) {my $v = $bitscore_ba/$best{$b}; print STDERR "Removed $_ because $best{$b} vs $bitscore_ba ($v)\n";}}
  }

  if ($debug) {
  print STDERR "\nEnd with ";
  print STDERR scalar(keys %reciprocal_matches);
  print STDERR " edges\n";

  my %gene_num;
  # Count gene number
  foreach (keys %reciprocal_matches) {
    my ($a,$b) = split(" ",$_);
    $gene_num{$a}++;
    $gene_num{$b}++;
  }
  print STDERR "Number of genes: ".scalar(keys %gene_num)."\n";}

  return \%reciprocal_matches;
}

sub synteny_matches {
  my %reciprocal_matches = %{(shift)};
  my $file_i = shift;
  my $file_j = shift;

  # Get order for both species (same hash as ids are non overlapping)
  my %order;
  my %track;
  for my $file ($file_i, $file_j) {
    # Get Coordinates for all genes
    my %coords = %{&read_details($file)};
    my $counter = 0;
    # Number them according to their order
    foreach my $id (sort
      {
        my @a = split("\t",$coords{$a});
        my @b = split("\t",$coords{$b});

#       #chr strand pos
#       if ($a[0] ne $b[0]) {return $a[0] cmp $b[0];}
#       if ($a[1] ne $b[1]) {return $a[1] cmp $b[1];}
#       return $a[2] <=> $b[2];

        #chr pos
        if ($a[0] ne $b[0]) {return $a[0] cmp $b[0];}
        return $a[2] <=> $b[2];
      } (keys(%coords))) {
      my @v = split("\t",$coords{$id});
      $order{$id} = ++$counter."\t".$v[1];  # Store strand info
      $track{$file.$counter} = $id;   # Reverse Mapping
    }
  }

  my $output = "";

  my @multis; # array that contains all multi-edges
  # Convert reciprocal matches to ffadj input
  foreach (keys %reciprocal_matches) {
    my @values = split(/\s/,$reciprocal_matches{$_});
    my ($a, $b) = split(/\s/,$_);
    unless (defined($order{$a})) {$a = &convertNCBI($a);}
    unless (defined($order{$b})) {$b = &convertNCBI($b);}
    my @a = split(/\s/,$order{$a});
    my @b = split(/\s/,$order{$b});
    unless (defined($a[0])) {&Error("Failed parsing gene IDs from blast output/gff input\n");}

    unless (defined($multis[$a[0]])) {$multis[$a[0]] = $b[0];}
    else         {$multis[$a[0]] .= ','.$b[0];}

    $output .= "$b[0]\t$a[0]\t";      # Positions
    if ($a[1] eq $b[1])   {$output .= "1\t";} # Same strand?
    else      {$output .= "-1\t";}
    my $score = (&edgeweight($values[0])+&edgeweight($values[2]))/2;  # Score made from e-values
    $output .= $score."\n";
  }

  # Check multis
  my %close_copies;
  if ($neighbourjoin) {
    for (my $i = 1; $i < scalar(@multis); $i++) {
      unless (defined($multis[$i])) {next;}
      my @partners = sort { $a <=> $b } split(',',$multis[$i]);
      if (scalar(@partners) <= 1) {next;}
      my $dist_limit = 2; # How far can tandem copies be away from each other? (0/1 = off, 2 = immediate, ...
      my $last = 999999999999999;
      foreach my $new (@partners) {
        if (abs($last-$new) < $dist_limit) {
          if (!defined($close_copies{$last}))   {$close_copies{$last} = $new;}
          else          {$close_copies{$last} .= ','.$new;}
          $close_copies{$new} = $last;    # The list is sortet, so we are here for the frist time
        }
        $last = $new;
      }
    }
  }

  return ($output, \%track, \%close_copies);
}

# Read the length of all genes in a given set of fasta files
sub get_gene_lengths {
  my %lengths;
  while (my $file = shift @_) {
    my $last_gene = "";
    open(FASTA,"<$file") || &Error("Could not open '$file': $!");
    while (<FASTA>) {
      $_=~s/[\r\n]+$//;
      if ($_ =~ />/) {
        $_ =~ s/^>//; # remove the starting >
        $_ =~ s/\s.*//; # trim the description

        # if ($blastmode eq "mmseqsn" || $blastmode eq "mmseqsp") {
    #      $_ =~ s/[^|]+\|([^|]+)\|[^|]+/$1/;
    #      $_ =~ s/[^|]+\|[^|]+\|([^|]+)/$1/;
    #      $_ =~ s/[^|]+\|([^|]+)/$1/;
        # }
        $lengths{$_} = 0;
        $last_gene = $_;

      }
      else {
        $lengths{$last_gene} += length($_);

      }
    }
    close(FASTA);
  }

  return \%lengths;
}

sub print_blast_stats {
  if (!$verbose) {return;}
  {
    if ($jobs_todo == 0) {&Error("Nothing to do. This should not happen! Please submit the FASTA file(s) and the parameter vector (above) to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com to help fixing this issue. ");}
    lock($jobs_done);
    my $percent = int($jobs_done/$jobs_todo*10000)/100;
    print STDERR "\r                                                                               ";

    if ($split_to_X_jobs == -1) {
      print STDERR "\rRunning blast analysis: $percent% ($jobs_done/$jobs_todo)";
    }
    else {  # 5.16
      $percent = int($jobs_done/$part*10000)/100;
      print STDERR "\rRunning blast analysis: $percent% ($jobs_done/$part, $jobs_todo in total)";
    }
  }
}

sub match {
  my %length = %{(shift)};
  my @i = @{(shift)};
  my @j = @{(shift)};

  my %legal_i = &get_legal_matches(\%length,@i);
  my %legal_j = &get_legal_matches(\%length,@j);

  return &get_reciprocal_matches(\%legal_i,\%legal_j);
}

sub get_reciprocal_matches {
  my %i = %{(shift)};
  my %j = %{(shift)};
  my %reciprocal;

  foreach (keys %i) {
    my ($i, $j) = split(" ",$_);

    # Reciprocal hit?
    if (!defined($j{"$j $i"})) {next;}

    # Merge both
    $reciprocal{$_} = $i{$_}." ".$j{"$j $i"}; # evalue_ij, bitscore_ij, evalue_ji, bitscore_ji
  }

  return \%reciprocal;
}

sub get_legal_matches {
  my %length = %{(shift)};

  my %result;
  foreach (@_) {
    my ($query_id,$subject_id,$local_identity,$alignment_length,$mismatches,$openings,$query_start,$query_end,$subject_start,$subject_end,$local_evalue,$local_bitscore) = split(/\s+/);

    if($blastmode eq "mmseqsp" || $blastmode eq "mmseqsn"){$local_identity = $local_identity*100;}

    if ($debug) {print STDERR "?$query_id -> $subject_id ($local_evalue,$local_bitscore)\n";}

    # Bug tracking
    unless (defined($length{$query_id}))  {if($query_id ne '#'){print STDERR "$RED [ERROR] Query gene ID '$query_id' is present in blast output but was not present in FASTA input! Skipping line. I will continue, but results may be insufficien $NC \n";} next;}
    unless (defined($length{$subject_id}))  {if($query_id ne '#'){print STDERR "$RED [ERROR] Subject gene ID '$subject_id' is present in blast output but was not present in FASTA input! Skipping line. I will continue, but results may be insufficien $NC \n";} next;}

    ## Check for criteria
    # Well formatted
    if (!defined($local_bitscore))              {next;}
    if ($evalue < $local_evalue) {next;} # 5.17, post filter e-value

# if(($blastmode ne "phmmer" && $blastmode ne "jackhmmer")){ #workaround for jackhmmer + phmmer
    # Percent identity
    if (!$twilight && $local_identity < $identity)          {if ($debug) {print STDERR "!$query_id -> $subject_id is removed because of identity threshold\n";} next;}
    if ( $twilight && $local_identity < &identitybylength($alignment_length)) {if ($debug) {print STDERR "!$query_id -> $subject_id is removed because of identity by length threshold\n";} next;}
    # Min. length
    if ($blastmode eq "tblastx+" || $blastmode eq "tblastx") {$alignment_length *= 3;}
    if ($alignment_length < $length{$query_id}*($coverage/100)+0.5)     {if ($debug) {print STDERR "!$query_id -> $subject_id is removed because of coverage threshold\n";} next;}
    if ($alignment_length < $length{$subject_id}*($coverage/100)+0.5)     {if ($debug) {print STDERR "!$query_id -> $subject_id is removed because of coverage threshold\n";} next;}

    # It hit itself (only during selfblast)
    # if ($selfblast && $query_id eq $subject_id)           {next;} # 5.16 reuse IDs
    ## Listing them in the graph is okay, clustering will ignore them
# }
    # Similar hits? Take the better one
    if (defined($result{"$query_id $subject_id"})) {
      my ($remote_evalue, $remote_bitscore) = split(" ",$result{"$query_id $subject_id"});
      if ($local_evalue > $remote_evalue) {next;}
      if ($local_bitscore < $remote_bitscore) {next;}
    }

    # Store data
    if ($debug) {print STDERR "!$query_id -> $subject_id ($local_evalue,$local_bitscore)\n";}
    $result{"$query_id $subject_id"} = "$local_evalue $local_bitscore";
  }

  return %result;
}

# Auto set the number of CPUs
sub auto_cpus {
  if ($cpus == 0) {
    my $cpu_x = qx(getconf _NPROCESSORS_ONLN);    $cpu_x =~ s/[^0-9]//g;
    # Fallback
    if (length($cpu_x) == 0 || $cpu_x == 0) {
      # Linux
      if (-e "/proc/cpuinfo") {
        $cpu_x = qx(grep processor /proc/cpuinfo | wc -l);
      }
      # Try Mac
      else {
        $cpu_x = qx(system_profiler | grep CPUs:);
      }
      $cpu_x =~ s/[^0-9]//g;
    }
    if (length($cpu_x) == 0 || $cpu_x == 0) {
      print STDERR "failed! Use 1 core only\n";$cpu_x = 1;
    }
    if($verbose){print STDERR "Detected $cpu_x available CPU threads, ";}
    $cpus = $cpu_x;
  }
  else {
    if($verbose){print STDERR "Using $cpus CPU threads, ";}
  }
}

sub generate_indices {
  my $oldkeep=$keep;
  if($verbose){print STDERR "Generating indices";if($force){print STDERR " anyway (forced).\n"}else{print STDERR ".\n";}}
  if ($blastmode eq "rapsearch") {
    foreach my $file (@_) {
      if ($file =~ /\s/) {print STDERR ("$ORANGE\n[WARNING]$NC : File name '$file' contains whitespaces. This might lead to undesired effects. If you encounter unusual behaviours, please change the file name!$NC\n");}

      if(!$force && join("",glob("$file.$blastmode*")) ne ""){
        if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        system("$makedb -d '$file' -n '$file.$blastmode' >\/dev\/null");
        if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";

          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          system("$makedb -d '$file' -n '$tmp_path/DB/".basename($file).".$blastmode' >\/dev\/null");

          if($?!=0){ $keep=$oldkeep; &Error("The database generation failed once again, please retry with 'sudo' or move the fasta files to a directory with write permissions. If this failes too, then there is something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- consider to submitting (mailing) this case to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
        }
      }
    }
  }
  elsif ($blastmode eq "diamond" ) {
    foreach my $file (@_) {
      if ($file =~ /\s/) {&Error("File name '$file' contains whitespaces. This might lead to undesired effects. If you encounter unusual behaviours, please change the file name!\n");}
      if(!$force && join("",glob("$file.$blastmode*")) ne ""){
        if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        system("$makedb '$file' -d '$file.$blastmode' --quiet >\/dev\/null 2>\/dev\/null");
        if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";

          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          system("$makedb '$file' -d '$tmp_path/DB/".basename($file).".$blastmode' --quiet >\/dev\/null 2>\/dev\/null");

          if($?!=0){ $keep=$oldkeep; &Error("The database generation failed once again, please retry with 'sudo' or move the fasta files to a directory with write permissions. If this failes too, then there is something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- consider to submitting (mailing) this case to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
        }
      }
    }
  }
  elsif ($blastmode eq "topaz") {
    foreach my $file (@_) {
      if ($file =~ /\s/) {&Error("File name '$file' contains whitespaces. This might lead to undesired effects. If you encounter unusual behaviours, please change the file name!\n");}

      if(!$force && join("",glob("$file.$blastmode*")) ne ""){
        if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        system("$makedb index -f '$file' -p '$file.$blastmode' >\/dev\/null 2>\/dev\/null");
        if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";

          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          system("$makedb index -f '$file' -p '$tmp_path/DB/".basename($file).".$blastmode' >\/dev\/null 2>\/dev\/null");

          if($?!=0){ $keep=$oldkeep; &Error("The database generation failed once again, please retry with 'sudo' or move the fasta files to a directory with write permissions. If this failes too, then there is something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- consider to submitting (mailing) this case to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
        }
      }
    }
  }
  elsif ($blastmode eq "mmseqsp") {
    foreach my $file (@_) {
      if ($file =~ /\s/) {&Error("File name '$file' contains whitespaces. This might lead to undesired effects. If you encounter unusual behaviours, please change the file name!\n");}

      if(!$force && join("",glob("$file.$blastmode*")) ne ""){
        if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        system("$makedb --dbtype 1 '$file' '$file.$blastmode' >\/dev\/null 2>\/dev\/null");
        if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";

          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          system("$makedb --dbtype 1 '$file' '$tmp_path/DB/".basename($file).".$blastmode' >\/dev\/null 2>\/dev\/null");

          if($?!=0){ $keep=$oldkeep; &Error("The database generation failed once again, please retry with 'sudo' or move the fasta files to a directory with write permissions. If this failes too, then there is something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- consider to submitting (mailing) this case to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
        }
      }
    }
  }
  elsif ($blastmode eq "mmseqsn") {
    foreach my $file (@_) {
      if ($file =~ /\s/) {&Error("File name '$file' contains whitespaces. This might lead to undesired effects. If you encounter unusual behaviours, please change the file name!\n");}

      if(!$force && join("",glob("$file.$blastmode*")) ne ""){
        if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        system("$makedb --dbtype 2 '$file' '$file.$blastmode' >\/dev\/null 2>\/dev\/null");
        if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";

          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          system("$makedb --dbtype 2 '$file' '$tmp_path/DB/".basename($file).".$blastmode' >\/dev\/null 2>\/dev\/null");

          if($?!=0){ $keep=$oldkeep; &Error("The database generation failed once again, please retry with 'sudo' or move the fasta files to a directory with write permissions. If this failes too, then there is something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- consider to submitting (mailing) this case to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
        }
      }
    }
  }
  elsif ($blastmode eq "usearch" || $blastmode eq "ublast") {
    foreach my $file (@_) {
      if ($file =~ /\s/) {&Error("File name '$file' contains whitespaces. This might lead to undesired effects. If you encounter unusual behaviours, please change the file name!\n");}

      if(!$force && join("",glob("$file.$blastmode*")) ne ""){
        if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        system("$makedb '$file' -output '$file.$blastmode' >\/dev\/null 2>\/dev\/null");
        if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";

          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          system("$makedb '$file' -output '$tmp_path/DB/".basename($file).".$blastmode' >\/dev\/null 2>\/dev\/null");

          if($?!=0){ $keep=$oldkeep; &Error("The database generation failed once again, please retry with 'sudo' or move the fasta files to a directory with write permissions. If this failes too, then there is something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- consider to submitting (mailing) this case to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
        }
      }
    }
  }
  elsif ($blastmode eq "lastp" || $blastmode eq "lastn") {
    foreach my $file (@_) {
      if ($file =~ /\s/) {&Error("File name '$file' contains whitespaces. This might lead to undesired effects. If you encounter unusual behaviours, please change the file name!\n");}
      if(!$force && join("",glob("$file.$blastmode*")) ne ""){
        if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        if($blastmode eq "lastp"){
          system("$makedb -p '$file.$blastmode' '$file'");
          if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";

            mkdir("$tmp_path/DB");
            if($step==1){$oldkeep=$keep;$keep=1;}
            system("$makedb -p '$tmp_path/DB/".basename($file).".$blastmode' '$file'");

            if($?!=0){ &Error("The database generation failed once again, please retry with 'sudo' or move the fasta files to a directory with write permissions. If this failes too, then there is }something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So try to update $blastmode, consider another blast algorithm (-p) or consider to submitting this case to .");}
          }
        }else{
          system("$makedb '$file.$blastmode' '$file'");
          if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";

            mkdir("$tmp_path/DB");
            if($step==1){$oldkeep=$keep;$keep=1;}
            system("$makedb '$tmp_path/DB/".basename($file).".$blastmode' '$file'");

            if($?!=0){ $keep=$oldkeep; &Error("The database generation failed once again, please retry with 'sudo' or move the fasta files to a directory with write permissions. If this failes too, then there is something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- consider to submitting (mailing) this case to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
          }
        }
      }
    }
  }
  elsif ($blastmode =~ m/.*blat.*/) {
    if($verbose){print STDERR "No indices needed\n";}
  }
  elsif ($blastmode =~ m/blast.*\+$/) {  # new blast+
    foreach my $file (@_) {
      if ($file =~ /\s/) {&Error("File name '$file' contains whitespaces. This might lead to undesired effects. If you encounter unusual behaviours, please change the file name!\n");}
      if(!$force && join("",glob("$file.$blastmode*")) ne ""){
        if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        if ($debug) {print STDERR "$makedb '$file' -out '$file.$blastmode' >\/dev\/null\n";}
        system("$makedb '$file' -out '$file.$blastmode' >\/dev\/null");
        if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";

          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          system("$makedb '$file' -out '$tmp_path/DB/".basename($file).".$blastmode' >\/dev\/null");

          if($?!=0){ $keep=$oldkeep; &Error("The database generation failed once again, please retry with 'sudo' or move the fasta files to a directory with write permissions. If this failes too, then there is something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- consider to submitting (mailing) this case to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}}
      }
    }
    unlink('formatdb.log');
  }else { # old blastall
    foreach my $file (@_) {
      if ($file =~ /\s/) {&Error("File name '$file' contains whitespaces. This might lead to undesired effects. If you encounter unusual behaviours, please change the file name!\n");}
      if(!$force && join("",glob("$file.$blastmode*")) ne ""){
        if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        if ($debug) {print STDERR "$makedb '$file' -out '$file.$blastmode' >\/dev\/null\n";}
        system("$makedb '$file' -n '$file.$blastmode' >\/dev\/null");
        if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";

          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          system("$makedb '$file' -n '$tmp_path/DB/".basename($file).".$blastmode' >\/dev\/null");

          if($?!=0){ $keep=$oldkeep; &Error("The database generation failed once again, please retry with 'sudo' or move the fasta files to a directory with write permissions. If this failes too, then there is something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- consider to submitting (mailing) this case to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}}
      }
    }
    unlink('formatdb.log');
  }
  ##MARK_FOR_NEW_BLAST_ALGORITHM

}

sub blast {

  my $command = "";

  my $a = $_[0];
  my $b = $_[1];
  my $thread_id = $_[2];
  if( $blastmode eq "mmseqsp" || $blastmode eq "mmseqsn" || $blastmode eq "mmseqs" ){ if(!-d "$tmp_path/mmseqs".$thread_id ){mkdir("$tmp_path/mmseqs".$thread_id);}else{rmdir("$tmp_path/mmseqs".$thread_id);mkdir("$tmp_path/mmseqs".$thread_id);}}

  $a = basename($a);
  $b = basename($b);
  my $bla = "$tmp_path$a.vs.$b.".$blastmode;

  my $blastOptions_diamond=''; #additional option --sensitive (if gene length is larger than 40)
  my $max_genlen = 1;
    if($blastmode eq "diamond"){
      if (index($blastOptions, "--more-sensitive") == -1 && index($blastOptions, "--sensitive") == -1) {
      if( $max_gene_length_diamond{$a} > 40 || $max_gene_length_diamond{$b} > 40 ){ $blastOptions_diamond = "--sensitive" }
    }
  }

  my $printSTDERR="";
  if($verbose != 2){
    $printSTDERR='2>/dev/null';
  }

  my ($fileType) = $a =~ /\.(\w*)$/;

  $a = $_[0]; # get a b again for blast algorithm (now they are not basenamed)
  $b = $_[1];

  if($blastmode !~ m/blat/ && join("",glob("$a.$blastmode*"))eq ""){
    $a = "$tmp_path/DB/".basename($a);
    if(join("",glob("$a.$blastmode*"))eq ""){
      &Error("I did not find the database for ".$_[0].". Did you run --step=1 and maybe removed the databases? Please rerun 'proteinortho --step=1 --force /path/to/fastas' such that the databases can be recreated and then proceed with -step=2 and -step=3.");
    }
  }
  if($blastmode !~ m/blat/ && join("",glob("$b.$blastmode*"))eq ""){
    $b = "$tmp_path/DB/".basename($b);
    if(join("",glob("$b.$blastmode*"))eq ""){
      &Error("I did not find the database for ".$_[1].". Did you run --step=1 and maybe removed the databases? Please rerun 'proteinortho --step=1 --force /path/to/fastas' such that the databases can be recreated and then proceed with -step=2 and -step=3.");
    }
  }

  if    ($blastmode eq "blastp_legacy" || $blastmode eq "blastn_legacy" || $blastmode eq "tblastx_legacy") {lock($threads_per_process); $command = $binpath."blastall -a $threads_per_process -d '$a.$blastmode' -i '$_[1]' -p $blastmode -m8 -e $evalue $blastOptions $printSTDERR";}
  elsif ($blastmode eq "blastp+")   {lock($threads_per_process); $command = $binpath."blastp -num_threads $threads_per_process -db '$a.$blastmode' -query '$_[1]' -evalue $evalue -outfmt 6 $blastOptions $printSTDERR";}
  elsif ($blastmode eq "blastn+")   {lock($threads_per_process); $command = $binpath."blastn -num_threads $threads_per_process -db '$a.$blastmode' -query '$_[1]' -evalue $evalue -outfmt 6 $blastOptions $printSTDERR";}
  elsif ($blastmode eq "tblastx+")  {lock($threads_per_process); $command = $binpath."tblastx -num_threads $threads_per_process -db '$a.$blastmode' -query '$_[1]' -evalue $evalue -outfmt 6 $blastOptions $printSTDERR";}
  elsif ($blastmode eq "blatp")       {lock($threads_per_process); $command = $binpath."blat -prot '$_[0]' '$_[1]' -out=blast8 $bla $blastOptions >\/dev\/null $printSTDERR";}
  elsif ($blastmode eq "blatn")       {lock($threads_per_process); $command = $binpath."blat '$_[0]' '$_[1]' -out=blast8 $bla $blastOptions >\/dev\/null $printSTDERR";}
  elsif ($blastmode eq "blatx")       {lock($threads_per_process); $command = $binpath."blat '$_[0]' '$_[1]' -t=dnax -q=dnax -out=blast8 $bla $blastOptions >\/dev\/null $printSTDERR";}
  #elsif ($blastmode eq "pblatp")      {lock($threads_per_process); $command = $binpath."pblat -prot '$_[0]' '$_[1]' -threads=$threads_per_process -out=blast8 $bla >\/dev\/null 2>&1";}
  #elsif ($blastmode eq "pblatn")      {lock($threads_per_process); $command = $binpath."pblat '$_[0]' '$_[1]' -threads=$threads_per_process -out=blast8 $bla >\/dev\/null 2>&1";}
  #elsif ($blastmode eq "pblatx")      {lock($threads_per_process); $command = $binpath."pblat '$_[0]' '$_[1]' -t=dnax -q=dnax -threads=$threads_per_process -out=blast8 $bla >\/dev\/null 2>&1";}
  elsif ($blastmode eq "diamond" )    {lock($threads_per_process); $command = $binpath."diamond blastp --threads $threads_per_process -d '$a.$blastmode' -q '$_[1]' -a $bla -e $evalue --quiet $blastOptions $blastOptions_diamond >'\/dev\/null' $printSTDERR";}
  elsif ($blastmode eq "topaz" )      {lock($threads_per_process); $command = $binpath."topaz search -T $threads_per_process -p '$a.$blastmode' -f '$_[1]' -E $evalue --blasttab $blastOptions >'$bla' $printSTDERR";}
  elsif ($blastmode eq "ublast")      {lock($threads_per_process); $command = $binpath."usearch -ublast '$_[1]' -db '$a.$blastmode' -threads $threads_per_process -evalue $evalue -blast6out $bla $blastOptions >\/dev\/null $printSTDERR";}
  elsif ($blastmode eq "usearch")     {lock($threads_per_process); $command = $binpath."usearch -usearch_local '$_[1]' -db '$a.$blastmode' -threads $threads_per_process -evalue $evalue -id 0 -blast6out $bla $blastOptions >\/dev\/null $printSTDERR";}
  elsif ($blastmode eq "rapsearch")   {lock($threads_per_process); $command = $binpath."rapsearch -s T -d '$a.$blastmode' -q '$_[1]' -z $threads_per_process -e $evalue -out $bla $blastOptions >\/dev\/null $printSTDERR";}
  elsif ($blastmode eq "lastp")       {lock($threads_per_process); $command = $binpath."lastal -E$evalue -fBlastTab+ -pBL62 -P$threads_per_process '$a.$blastmode' '$_[1]' $blastOptions > $bla $printSTDERR";}
  elsif ($blastmode eq "lastn")       {lock($threads_per_process); $command = $binpath."lastal -E$evalue -fBlastTab+ -P$threads_per_process '$a.$blastmode' '$_[1]' $blastOptions > $bla $printSTDERR";}
  elsif ($blastmode eq "mmseqsn" || $blastmode eq "mmseqsp")       {lock($threads_per_process); $command = $binpath."mmseqs search '$b.$blastmode' '$a.$blastmode' $bla $tmp_path/mmseqs$thread_id --threads $threads_per_process -e $evalue $blastOptions >\/dev\/null $printSTDERR";}
  else  {&Error("This should not happen! Please submit the FASTA file(s) and the parameter vector (above) to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com to help fixing this issue.");}
  ##MARK_FOR_NEW_BLAST_ALGORITHM

  # File does not exists yet or I am forced to rewrite it
  if (!(-e $bla) || $force) {

    if (-e $bla && $force) { system("rm $bla"); }

    if ($debug || $verbose==2) {print STDERR "$command\n";}                     # 5.16
    if ($blastmode eq "diamond") {
      my $command2 = $binpath."diamond view --threads $threads_per_process --quiet -a $bla.daa -o $bla.tmp >/dev/null $printSTDERR";
      system("$command && $command2");   # run diamond and presort
      if ($? != 0) {
        &Error($blastmode." failed.\nThe most likely  errorsources of this are:\n- no space left on device error.\n- outdated $blastmode, please update $blastmode or consider another -p algorithm.\n- the databases are missing. Maybe you ran --step=1 and removed the databases afterwards? Please rerun 'proteinortho --step=1 --force /path/to/fastas'\n- maybe the fasta files are mixed nucleotide and aminoacid sequences or just not suited for $blastmode? (For example diamond only processes protein sequences) Try 'proteinortho --step=1 --check --force /path/to/fastas'.");
      }
      if($debug eq "test_sort"){while (<"$bla.tmp">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $bla\n";&reset_locale();die;}}}
      system("rm $bla.daa");
    }
    elsif ($blastmode eq "rapsearch") {
      system("$command");
      if ($? != 0) {
          &Error($blastmode." failed.\nThe most likely sources of this error are:\n- no space left on device error.\n- outdated $blastmode, please update $blastmode or consider another -p algorithm.\n- the databases are missing. Maybe you ran --step=1 and removed the databases afterwards? Please rerun 'proteinortho --step=1 --force /path/to/fastas'\n- maybe the fasta files are mixed nucleotide and aminoacid sequences or just not suited for $blastmode? (For example diamond only processes protein sequences) Try 'proteinortho --step=1 --check --force /path/to/fastas'.");
        }
        #
        # NOTE -s f does not work everytime, therefore here is a conversion of -s t to f (ln -> evalues)
        #
      open(OUTM,">>$bla.m82");
      open(INM,"<$bla.m8");
      while (<INM>){
        if(length($_)==0 || substr($_,0,1)eq"#"){next;}my @arr=split("\t",$_); print OUTM $arr[0]."\t".$arr[1]."\t".$arr[2]."\t".$arr[3]."\t".$arr[4]."\t".$arr[5]."\t".$arr[6]."\t".$arr[7]."\t".$arr[8]."\t".$arr[9]."\t".exp($arr[10])."\t".$arr[11];
      }
      close(OUTM);
      close(INM);
      if($debug eq "test_sort"){while (<"$bla.m8">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $bla.m8\n";&reset_locale();die;}}}
      system("rm $bla.aln"); #remove aln file
      system("touch $bla.m8"); # rapsearch does not produces an output file if there are no hits found (<evalue threshold).
      system("tail -n +6 $bla.m82 >'$bla.tmp'"); # remove head/comment lines of rapsearch
      system("rm $bla.m8");
      system("rm $bla.m82");
    }
    elsif ($blastmode eq "usearch" || $blastmode eq "ublast") {
      system("$command");
      if ($? != 0) {
          &Error($blastmode." failed.\nThe most likely sources of this error are:\n- no space left on device error.\n- outdated $blastmode, please update $blastmode or consider another -p algorithm.\n- the databases are missing. Maybe you ran --step=1 and removed the databases afterwards? Please rerun 'proteinortho --step=1 --force /path/to/fastas'\n- maybe the fasta files are mixed nucleotide and aminoacid sequences or just not suited for $blastmode? (For example diamond only processes protein sequences) Try 'proteinortho --step=1 --check --force /path/to/fastas'.");
        }
      if($debug eq "test_sort"){while (<"$bla">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $bla\n";&reset_locale();die;}}}
      #system("perl $po_path/proteinortho_formatUsearch.pl $bla >'$bla.tmp_format'"); # problem with ublast/usearch: gene names include the description.
      #system("sort $bla.tmp_format -k12,12rg >'$bla.tmp'");
      system("perl $po_path/proteinortho_formatUsearch.pl $bla >'$bla.tmp'"); # problem with ublast/usearch: gene names include the description.
      #system("rm $bla.tmp_format");
    }
    elsif ($blastmode eq "topaz") {
      system("$command");
      if ($? != 0) {
          &Error($blastmode." failed.\nThe most likely sources of this error are:\n- no space left on device error.\n- outdated $blastmode, please update $blastmode or consider another -p algorithm.\n- the databases are missing. Maybe you ran --step=1 and removed the databases afterwards? Please rerun 'proteinortho --step=1 --force /path/to/fastas'\n- maybe the fasta files are mixed nucleotide and aminoacid sequences or just not suited for $blastmode? (For example diamond only processes protein sequences) Try 'proteinortho --step=1 --check --force /path/to/fastas'.");
        }
      if($debug eq "test_sort"){while (<"$bla">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $bla\n";&reset_locale();die;}}}
      #system("tail -n +2 $bla > $bla.nofirstline && mv $bla.nofirstline $bla && sort $bla -k12,12rg >'$bla.tmp'");
          system("tail -n +2 $bla > $bla.tmp");
      unlink("timing.txt");
    }
    elsif ($blastmode eq "lastp" || $blastmode eq "lastn") {
      system("$command");
      if ($? != 0) {
          &Error($blastmode." failed.\nThe most likely sources of this error are:\n- no space left on device error.\n- outdated $blastmode, please update $blastmode or consider another -p algorithm.\n- the databases are missing. Maybe you ran --step=1 and removed the databases afterwards? Please rerun 'proteinortho --step=1 --force /path/to/fastas'\n- maybe the fasta files are mixed nucleotide and aminoacid sequences or just not suited for $blastmode? (For example diamond only processes protein sequences) Try 'proteinortho --step=1 --check --force /path/to/fastas'.");
        }
      if($debug eq "test_sort"){while (<"$bla">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $bla\n";&reset_locale();die;}}}
      #system("sort $bla -k12,12rg > $bla.tmp");
          system("mv $bla $bla.tmp");
    }
    elsif ($blastmode =~ m/.*blat.*/) {
      system("$command");
      if ($? != 0) {
          &Error($blastmode." failed.\nThe most likely sources of this error are:\n- no space left on device error.\n- outdated $blastmode, please update $blastmode or consider another -p algorithm.\n- the databases are missing. Maybe you ran --step=1 and removed the databases afterwards? Please rerun 'proteinortho --step=1 --force /path/to/fastas'\n- maybe the fasta files are mixed nucleotide and aminoacid sequences or just not suited for $blastmode? (For example diamond only processes protein sequences) Try 'proteinortho --step=1 --check --force /path/to/fastas'.");
        }
      if($debug eq "test_sort"){while (<"$bla">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $bla\n";&reset_locale();die;}}}
      #system('awk -F\'\t\' \'{if($11<'.$evalue.')print $0}\' '.$bla.' > '.$bla.'.unsorted');
      #system("sort $bla.unsorted -k12,12rg > $bla.tmp");
      system('awk -F\'\t\' \'{if($11<'.$evalue.')print $0}\' '.$bla.' > '.$bla.'.tmp');
      #unlink("$bla.unsorted");
    }
    elsif ($blastmode eq "mmseqsn" || $blastmode eq "mmseqsp") {

      system("$command");
      if ($? != 0) {
        &Error($blastmode." failed.\nThe most likely sources of this error are:\n- no space left on device error.\n- outdated $blastmode, please update $blastmode or consider another -p algorithm.\n- the databases are missing. Maybe you ran --step=1 and removed the databases afterwards? Please rerun 'proteinortho --step=1 --force /path/to/fastas'\n- maybe the fasta files are mixed nucleotide and aminoacid sequences or just not suited for $blastmode? (For example diamond only processes protein sequences) Try 'proteinortho --step=1 --check --force /path/to/fastas'.");
      }
      system($binpath."mmseqs convertalis '$b.$blastmode' '$a.$blastmode' $bla $bla.tmp2 >\/dev\/null 2>\/dev\/null");

      if($debug eq "test_sort"){while (<"$bla.tmp">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $bla.tmp\n";&reset_locale();die;}}}
      system("rm $bla.*index*");
      system('sed -re \'s/([0-9\.]+)(E)(\-|\+)([0-9\.]+)/\1e\3\4/g\' '.$bla.'.tmp2 >'.$bla.'.tmp 2>\/dev\/null');

      unlink("$bla.dbtype");
      unlink("$bla.tmp2");
      unlink("$bla.index");

    }
    else {
    #system("$command | sort -k12,12rg >'$bla.tmp'");   # run blast and presort (speeds up best alignment search but is NOT mandatory)  # 5.16
      system("$command >'$bla.tmp'");
      if ($? != 0) {
        &Error($blastmode." failed.\nThe most likely sources of this error are:\n- no space left on device error.\n- outdated $blastmode, please update $blastmode or consider another -p algorithm.\n- the databases are missing. Maybe you ran --step=1 and removed the databases afterwards? Please rerun 'proteinortho --step=1 --force /path/to/fastas'\n- maybe the fasta files are mixed nucleotide and aminoacid sequences or just not suited for $blastmode? (For example diamond only processes protein sequences) Try 'proteinortho --step=1 --check --force /path/to/fastas'.");
      }
     if($debug eq "test_sort"){while (<"$bla.tmp">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $bla.tmp\n";&reset_locale();die;}}}
    }
  ##MARK_FOR_NEW_BLAST_ALGORITHM

    system("mv '$bla.tmp' '$bla'");       # move when done, aids when job fails (no half bla files)     # 5.16
    if ($? != 0) {
        &Error($blastmode." failed.\nThe most likely sources of this error are:\n- no space left on device error.\n- outdated $blastmode, please update $blastmode or consider another -p algorithm.\n- the databases are missing. Maybe you ran --step=1 and removed the databases afterwards? Please rerun 'proteinortho --step=1 --force /path/to/fastas'\n- maybe the fasta files are mixed nucleotide and aminoacid sequences or just not suited for $blastmode? (For example diamond only processes protein sequences) Try 'proteinortho --step=1 --check --force /path/to/fastas'.");
    }
    my @data = &readFile($bla);
    #unless ($keep) {unlink("$bla");}       # delete tmp file
    return \@data;
  }
  # Otherwise, use existing data
  if ($verbose) {print STDERR "\nNote: '$bla' exists, using pre-calculated data\n";}
  my @data = &readFile($bla);
  return \@data;
}

sub readFile {
  open(FILE,"<$_[0]") || &Error("Error, could not open file $_[0]: $!");
  my @data = <FILE>;
  close(FILE);
  chomp @data;
  return @data;
}

sub check_bins {

  if($blastmode eq "blast"){print STDERR "$ORANGE\n[WARNING]$NC There is no blastmode -p=$blastmode. I will choose the protein search blastp+ and continue...$NC"; $blastmode="blastp+";}
  if($blastmode eq "last"){print STDERR "$ORANGE\n[WARNING]$NC There is no blastmode -p=$blastmode. I will choose the protein search lastp and continue...$NC";$blastmode.="p";}
  if($blastmode eq "mmseqs"){print STDERR "$ORANGE\n[WARNING]$NC There is no blastmode -p=$blastmode. I will choose the protein search mmseqsp and continue...$NC";$blastmode.="p";}
  if($blastmode =~ m/usearch.+/){print STDERR "$ORANGE\n[WARNING]$NC There is no blastmode -p=$blastmode. I will choose the protein search -p=usearch and continue...$NC";$blastmode="usearch";}
  if($blastmode =~ m/ublast.+/){print STDERR "$ORANGE\n[WARNING]$NC There is no blastmode -p=$blastmode. I will choose the protein search -p=ublast and continue...$NC";$blastmode="ublast";}
  if($blastmode =~ m/rapsearch.+/){print STDERR "$ORANGE\n[WARNING]$NC There is no blastmode -p=$blastmode. I will choose the protein search -p=rapsearch and continue...$NC";$blastmode="rapsearch";}
  ##MARK_FOR_NEW_BLAST_ALGORITHM

  if ($blastmode eq "blastp+" || $blastmode eq "blastn+" || $blastmode eq "tblastx+") {
    my $tmp = $blastmode;
    $tmp =~ s/\+//g;
    my $cmd = $binpath."$tmp -h";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /DESCRIPTION.*?\n\s*(.+)\n/) {
      my @version = split(/\s+/,$1);
      my $versionnumber = pop @version;
      # Commands
      if  ($blastmode eq "blastp+") {$makedb = $binpath."makeblastdb -dbtype prot -in";}
      elsif   ($blastmode eq "blastn+" || $blastmode eq "tblastx+") {$makedb = $binpath."makeblastdb -dbtype nucl -in";}
      else  {&Error("This should not happen! Please submit the FASTA file(s) and the parameter vector (above) to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com to help fixing this issue.");}

      if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";}
      return;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$tmp'.\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)");
  }elsif ($blastmode eq "blast+") {
      &Error("Please call -p=blastp+ for protein datasets and -p=blastn+ for nucleotide datasets (and -p=tblastx+ for translated query/db).");
  }
  elsif ($blastmode eq "blastp_legacy" || $blastmode eq "blastn_legacy" || $blastmode eq "tblastx_legacy") {
    my $cmd = $binpath."blastall";
    my @blastv = qx($cmd 2>&1);
    foreach (@blastv) {
      $_=~s/[\r\n]+$//;
      if ($_ =~ /blastall.+?([^\s]+)/) {
        my $versionnumber = $1;
        if  ($blastmode eq "blastp_legacy") {$makedb = $binpath."formatdb -p T -o F -i";}
        elsif   ($blastmode eq "blastn") {$makedb = $binpath."formatdb -p F -o F -i";}
        elsif   ($blastmode eq "tblastx") {$makedb = $binpath."formatdb -p F -o F -i";}
        else  {&Error("This should not happen! Please submit the FASTA file(s) and the parameter vector (above to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com to help fixing this issue.");}

        if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";}
        return;
      }
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/blastall'.\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)");
  }elsif ($blastmode eq "blast") {
    &Error("Please call -p=blastp_legacy for protein datasets and -p=blastn for nucleotide datasets.");
  }
  elsif ($blastmode eq "topaz") {
    my $cmd = $binpath."topaz -h";
    my @topazv = qx($cmd 2>&1);
    foreach (@topazv) {
      $_=~s/[\r\n]+$//;
      if ($_ =~ /usage: TOPAZ(.+)/) {
        $makedb = $binpath."topaz";
        if($verbose){print STDERR "Detected '$blastmode'\n";}
        return;
      }
    }
    print STDERR ("\n!!!$ORANGE\n[WARNING]$NC Failed to detect '$blastmode'.$NC\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)\nI try now -p=blastp+ as fallback.\n");
    print STDERR "\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
     sleep 10;
    print STDERR "\nWell then, proceeding...\n\n";
    $blastmode="blastp+";
    goto RESTART;
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$blastmode'\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...).");
  }
  elsif ($blastmode =~ m/^blat[np]/) {
    my $cmd = $binpath."blat";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /.*BLAT.*v\. ([\d\.]*)/) {
      my $versionnumber = $1;
      if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";}
      $makedb="";
      return;
    }else{
      print STDERR ("\n!!!$ORANGE\n[WARNING]$NC Failed to detect '$blastmode'.$NC\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)\nI try now -p=blastp+ as fallback.\n");
      print STDERR "\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
       sleep 10;
      print STDERR "\nWell then, proceeding...\n\n";
      $blastmode="blastp+";
      goto RESTART;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$blastmode'\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...).");
  }elsif ($blastmode eq "blat") {
    &Error("Please call -p=blatp for protein datasets and -p=blatn for nucleotide datasets.");
  }
  # elsif ($blastmode =~ m/^pblat.*/) {
  #   my $cmd = $binpath."pblat";
  #   my $out = qx($cmd);
  #   if (defined($out) && $out =~ /.*BLAT.*v\. ([\d\.]*)/) {
  #     my $versionnumber = $1;
  #     if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";}
  #     $makedb="";
  #     return;
  #   }
  #   &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$blastmode'\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...).");
  # }
  elsif ($blastmode eq "rapsearch") {
    my $cmd = $binpath."$blastmode -h";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /rapsearch\sv([\d\.]*)/) {
      my @version = split(/\s+/,$1);
      my $versionnumber = pop @version;
      $makedb = $binpath."prerapsearch";
     if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";}
      return;
    }else{
      print STDERR ("\n!!!$ORANGE\n[WARNING]$NC Failed to detect '$blastmode'.$NC\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)\nI try now -p=blastp+ as fallback.\n");
      print STDERR "\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
       sleep 10;
      print STDERR "\nWell then, proceeding...\n\n";
      $blastmode="blastp+";
      goto RESTART;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$blastmode'\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...).");
  }
  elsif ($blastmode eq "mmseqsn" || $blastmode eq "mmseqsp") {
    my $cmd = $binpath."mmseqs version";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /([a-zA-Z0-9]+)/) {
      my @version = split(/\s+/,$1);
      my $versionnumber = pop @version;
      $makedb = $binpath."mmseqs createdb";
      if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";}
      return;
    }else{
      print STDERR ("\n!!!$ORANGE\n[WARNING]$NC Failed to detect '$blastmode'.$NC\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)\nI try now -p=blastn+ as fallback.");
      print STDERR "\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
       sleep 10;
      print STDERR "\nWell then, proceeding...\n\n";
      $blastmode="blastp+";
      goto RESTART;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$blastmode'\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...).");
  }elsif ($blastmode eq "mmseqs") {
    &Error("Please call -p=mmseqsp for protein datasets and -p=mmseqsn for nucleotide datasets.");
  }
  elsif ($blastmode eq "diamond") {
    my $cmd = $binpath."diamond version 2>/dev/null";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /diamond\sversion\s(.+)\n/) {
      my @version = split(/\s+/,$1);
      my $versionnumber = pop @version;
      $makedb = $binpath."diamond makedb --in";
      if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";}
      if($versionnumber =~ m/^0\.9\.(\d+)/){ if($1 < 29){
        print STDERR "\n!!!! \nWARNING '$blastmode' version $versionnumber has a known bug that incorrectly computes the length of an alignment, thus the coverage threshold can produce wrong results leading in false negatives. See https://gitlab.com/paulklemm_PHD/proteinortho/issues/24 for more details.\n\n >>> Please update diamond to 0.9.29 or higher <<<\n";
        print STDERR "\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
        sleep 10;
        print STDERR "\nWell then, proceeding...\n\n";} }
      return;
    }else{
      print STDERR ("\n!!!$ORANGE\n[WARNING]$NC Failed to detect '$blastmode'.$NC\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)\nI try now -p=blastp+ as fallback.\n");
      print STDERR "\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
       sleep 10;
      print STDERR "\nWell then, proceeding...\n\n";
      $blastmode="blastp+";
      goto RESTART;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$blastmode'\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...).");
  }
  elsif ($blastmode eq "ublast") {
    my $cmd = $binpath."usearch -version";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /usearch\sv(.+)\n/) {
      my @version = split(/\s+/,$1);
      my $versionnumber = pop @version;
      $makedb = $binpath."usearch -makeudb_ublast";
      if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";}
      return;
    }else{
      print STDERR ("\n!!!$ORANGE\n[WARNING]$NC Failed to detect '$blastmode'.$NC\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)\nI try now -p=blastp+ as fallback.\n");
      print STDERR "\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
       sleep 10;
      print STDERR "\nWell then, proceeding...\n\n";
      $blastmode="blastp+";
      goto RESTART;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$blastmode'\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...).");
  }
  elsif ($blastmode eq "usearch") {
    my $cmd = $binpath."$blastmode -version";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /usearch\sv(.+)\n/) {
      my @version = split(/\s+/,$1);
      my $versionnumber = pop @version;
      $makedb = $binpath."usearch -makeudb_usearch";
      if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";}
      return;
    }else{
      print STDERR ("\n!!!$ORANGE\n[WARNING]$NC Failed to detect '$blastmode'.$NC\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)\nI try now -p=blastp+ as fallback.\n");
      print STDERR "\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
       sleep 10;
      print STDERR "\nWell then, proceeding...\n\n";
      $blastmode="blastp+";
      goto RESTART;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$blastmode'\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...).");
  }
  elsif ($blastmode eq "lastp" || $blastmode eq "lastn") {
    my $cmd = $binpath."lastal -V";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /lastal\s(.+)\n/) {
      my @version = split(/\s+/,$1);
      my $versionnumber = pop @version;
      $makedb = $binpath."lastdb";
      if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";}
      return;
    }else{
      print STDERR ("\n!!!$ORANGE\n[WARNING]$NC Failed to detect '$blastmode'.$NC\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)\nI try now -p=blastp+ as fallback.\n");
      print STDERR "\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
       sleep 10;
      print STDERR "\nWell then, proceeding...\n\n";
      $blastmode="blastp+";
      goto RESTART;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/lastal'.$NC\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)");
  }elsif ($blastmode eq "last" || $blastmode eq "lastal") {
    &Error("Please call -p=lastp for protein datasets and -p=lastn for nucleotide datasets.");
  }
  ##MARK_FOR_NEW_BLAST_ALGORITHM

  &Error("Blast mode '$blastmode' is not supported. Feel free to ask the author to add it.");
}

# Check plausibility of files
sub check_files {
  if ( ( scalar(@files) == 0 || (scalar(@files) == 1 && $selfblast==0) ) && $step != 3)   {&print_usage; &Error("I need at least two files to compare something!");}
  if($verbose){print STDERR "Checking input files";if($checkfasta){print STDERR " very carefully (-check).\n"}else{print STDERR ".\n";}}
  foreach my $file (@files) {
    if ($verbose) {print STDERR "Checking $file... ";}
    &read_details($file,1);
    if ($verbose) {print STDERR "ok\n";}
  }
}

sub convertNCBI {
  my $long_id = shift;
  $long_id =~ s/\|$//g;
  my @tmp = split(/\|/,$long_id); # take the last column for NCBI format like patterns (e.g. gi|158333234|ref|YP_001514406.1|)
  return pop(@tmp);
}

sub read_details {
  my %ids;        # local test for duplicated IDs
  my %genes;
  my $file = shift;
  my $test = 0;
  my $lastgenename="";
  my $cur_gene_is_valid=1;

  my $ATCGNoccurences=0; # for checking if faa or fna file -> minimum 50% ATCGN content for fna and maximal 80% for faa files
  my $genelength=0; # for checking if faa or fna file -> minimum genelength for checking is 50

  if (defined($_[0])) {$test = 1;}  # if no ID Hash is give, we do not want to test but to fetch the gff data

  my $did_found_emptyline=0;
# print STDERR "TEST: $test\n";
  if (!-e $file)    {&Error("File '$file' not found!");}
  open(FASTA,"<$file") || &Error("Could not open '$file': $!");
  while (<FASTA>) {
    if ($_ =~/^$/){#if($_ eq "" || $_ eq "\n" || $_ eq "\r" || $_ eq "\r\n" || $_ eq "\n\r"){
      $did_found_emptyline=1;
    }

    if ($_ =~ />/) { #head line -> gene name and description

      if(!$force && $checkfasta && exists($allowedAlphabet->{$blastmode}) && $cur_gene_is_valid<1){last;}
      if(!$force && $checkfasta && exists($allowedAlphabet->{$blastmode}) && ($genelength>50 && ( ($allowedAlphabet->{$blastmode} eq "n" && $ATCGNoccurences/$genelength < 0.5) || ($allowedAlphabet->{$blastmode} eq "a" && $ATCGNoccurences/$genelength > 0.8)))){$cur_gene_is_valid= -1;last;}

      $gene_counter{$file}++;
      $_ =~ s/[\r\n]+$//;#chomp only removes last \n newline, now also \r are removed and all occurences
      $_ =~ s/^>//;
      $_ =~ s/\s.*//;
      $lastgenename=$_;
      if ($test) {
        if (defined($ids{$_}))  {&Error("Gene ID '$_' is defined at least twice in $file");}
        $ids{$_} = $file;
      }
      if ($synteny) {
        my $short_id = &convertNCBI($_);
        $genes{$short_id} = 1;
      }

      $cur_gene_is_valid=1;
      $ATCGNoccurences=0;
      $genelength=0;

    }else{

      $_ =~ s/[\r\n]+$//;#chomp only removes last \n newline, now also \r are removed and all occurences
      if(!$force && $checkfasta&& exists($allowedAlphabet->{$blastmode})){ #test if the current gene is valid or not (for blastalgorithms that require either amino or nucleotide sequences only)
          if($allowedAlphabet->{$blastmode} eq "a"){ #"a"= aminoacid sequence
            if( $_ =~ /[^$aminoAlphabet]/){
              $cur_gene_is_valid=0;
            }
          }elsif($_ =~ /[^$nucleotideAlphabet]/){
              $cur_gene_is_valid=0;
          }
          $ATCGNoccurences += ($_ =~ tr/AaTtUuGgCcNn//);
          $genelength += length($_);
        }

      my($filename, $dir, $ext) = fileparse($file);
      if($blastmode eq "diamond"){ # test for -sensitive option (gene with length >40)
        if(exists $max_gene_length_diamond{$filename}){
          if(length($_)>$max_gene_length_diamond{$filename}){
            $max_gene_length_diamond{$filename}=length($_)+0;
          }
        }else{
          $max_gene_length_diamond{$filename}=length($_)+0;
        }
      }
     }
  }

  if($did_found_emptyline){
    print STDERR ("$ORANGE [WARNING]$NC Found empty line in $file, removing it with perl -lne.$NC");
    system('perl -lne \'if($_ ne ""){print "$_";}\' '.$file.' >'.$file.'.tmp');
    system('mv '.$file.'.tmp '.$file);
    if($step==2){
      print STDERR ("$ORANGE [WARNING]$NC Restarting the indices generation.$NC");
      my @arr=($file);
      &generate_indices(@arr);
    }
  }

  if(!$force && $checkfasta && $cur_gene_is_valid==1 && exists($allowedAlphabet->{$blastmode}) && ($genelength>50 && ( ($allowedAlphabet->{$blastmode} eq "n" && $ATCGNoccurences/$genelength < 0.5) || ($allowedAlphabet->{$blastmode} eq "a" && $ATCGNoccurences/$genelength > 0.8)))){$cur_gene_is_valid= -1;}
  if(!$force && $checkfasta && exists($allowedAlphabet->{$blastmode}) ){

    if($allowedAlphabet->{$blastmode} eq "n" && $cur_gene_is_valid<1 ){

      if($cur_gene_is_valid==-1){
       print STDERR ("\n$ORANGE [WARNING]$NC The occurences of ATCGN is less than 50% of in input fasta file '".$file."' in gene '".$lastgenename."'. $blastmode expects nucleotide characters...$NC");
      }else{
        print STDERR ("\n$ORANGE [WARNING]$NC Found forbidden non-nucleotide character in input fasta file '".$file."' in gene '".$lastgenename."'. $blastmode expects nucleotide characters...$NC");
      }
      if( exists($blastmode_pendant->{$blastmode}) && $restart_counter==0 && $step <2){ # only for step = 0 and step 1 you can do a rerun else the DB are missing
        $blastmode = $blastmode_pendant->{$blastmode};
        print STDERR ("\n!!!\n[WARNING]$NC Switching now to $blastmode and restarting...\n");
      print STDERR "\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
       sleep 10;
      print STDERR "\nWell then, proceeding...\n\n";
        goto RESTART;
      }

      &Error("\nThe algorithm (-p=$blastmode) does not support the given input files (use --force to skip this behaviour)...");

    }elsif($allowedAlphabet->{$blastmode} eq "a" && $cur_gene_is_valid<1 ){

      if($cur_gene_is_valid==-1){
        print STDERR ("\n$ORANGE [WARNING]$NC The occurences of ATCGN is greater than 80% of '".$file."' in gene '".$lastgenename."'. $blastmode expects aminoacid characters...$NC");
      }else{
        print STDERR ("\$ORANGE [WARNING]$NC Found forbidden non-aminoacid character in input fasta file '$file' in gene '$lastgenename'. $blastmode expects aminoacid characters$NC");
      }

      if(exists($blastmode_pendant->{$blastmode}) && $restart_counter==0 && $step <2){ # only for step = 0 and step 1 you can do a rerun else the DB are missing
        $blastmode = $blastmode_pendant->{$blastmode};
        print STDERR ("\n!!!\n[WARNING]$NC Switching now to $blastmode and restarting...\n");
      print STDERR "\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
       sleep 10;
      print STDERR "\nWell then, proceeding...\n\n";
        goto RESTART;
      }

      &Error("\nThe algorithm (-p=$blastmode) does not support the given input files (use --force to skip this behaviour)...");
    }
  }
  close(FASTA);

  unless ($synteny) {return;}

  my %coordinates;
  if ($verbose && $test) {print STDERR "$file\t".scalar(keys %genes)." genes\t";}
  my $gff = &gff4fasta($file);
  open(GFF,"<$gff") || &Error("Could not open '$gff': $!");
  while (<GFF>) {
    if ($_ =~ /^##FASTA/) {last;} # deal with prokka gffs, thx 2 Ben Woodcroft
    if ($_ =~ /^#/) {next;}
    # e.g. NC_009925.1  RefSeq  CDS 9275  10096 . - 0 ID=cds8;Name=YP_001514414.1;Parent=gene9;Dbxref=Genbank:YP_001514414.1,GeneID:5678848;gbkey=CDS;product=signal peptide peptidase SppA;protein_id=YP_001514414.1;transl_table=11
    my @col = split(/\t+/,$_);
    if ($col[2] ne "CDS") {next;}
    if ($col[8] =~ /Name=([^;]+)/i && defined($genes{$1})) {
      delete $genes{$1};
#     if (!$test) {$coordinates{$1} = "$col[0]\t$col[6]\t$col[3]";} # store
      if (!$test && $col[6] eq "+") {$coordinates{$1} = "$col[0]\t$col[6]\t$col[3]";} # store
      if (!$test && $col[6] eq "-") {$coordinates{$1} = "$col[0]\t$col[6]\t$col[4]";} # store
    }
    elsif ($col[8] =~ /ID=([^;]+)/i && defined($genes{$1})) {
      delete $genes{$1};
#     if (!$test) {$coordinates{$1} = "$col[0]\t$col[6]\t$col[3]";} # store
      if (!$test && $col[6] eq "+") {$coordinates{$1} = "$col[0]\t$col[6]\t$col[3]";} # store
      if (!$test && $col[6] eq "-") {$coordinates{$1} = "$col[0]\t$col[6]\t$col[4]";} # store
    }
  }
  close(GFF);

  if (scalar(keys %genes)) {
    my @tmp = keys %genes;
    &Error("No coordinate found for these gene(s): ".join(",",@tmp)."\nusing '$gff' and '$file'");
  }

  if (!$test) {return \%coordinates;}   # store
}

sub Error {
  $debug=1;
  if($_[0] ne "I need at least two files to compare something!"){print STDERR "\n";print STDERR &get_parameter;}

  print STDERR "\n\n$RED"."[Error]$NC $ORANGE ".$_[0]." $NC \n\n";

  if($_[0] ne "I need at least two files to compare something!"){print STDERR "(If you cannot solve this error, please send a report to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com including the parameter-vector above or visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Codes for more help. Further more all mails to lechner\@staff.uni-marburg.de are welcome)\n\n\n";}

  &reset_locale();
  if (!$keep && $tmp_path =~ m/\/proteinortho_cache_[^\/]+\d*\/$/ && $step!=1 ){system("rm -r $tmp_path >/dev/null 2>&1");}
  exit 1;
}

# Remove .fasta/.faa etc. and change it to .gff
## Update 6.001: Test for additional naming schemes
sub gff4fasta {
  my $gff = shift;
  $gff =~ s/\.[^.]+$/.gff/;
  if (-e $gff) {return $gff;}
  if (-e $gff."3") {return $gff."3";}

  my $ncbi_gff = $gff;
  if ($ncbi_gff =~ s/_cds_from//) {
    if (-e $ncbi_gff) {return $ncbi_gff;}
  }
  $ncbi_gff = $gff;
  if ($ncbi_gff =~ s/_protein/_genomic/) {
    if (-e $ncbi_gff) {return $ncbi_gff;}
  }

  return $gff;
}

sub get_po_path {
  my @tmppath = fileparse($0); # path to the C++-part of this program

  my $uname=`uname -s`;
  $uname=~s/[\r\n]+$//;
  $uname.="_".`uname -m`;
  $uname=~s/[\r\n]+$//;

  if(-x "proteinortho_clustering"){
    my $p=`whereis proteinortho_clustering`;
    $p=~s/^proteinortho_clustering:  *([^ ]+)\/proteinortho_clustering.*$/$1/;
    chomp($p);
    $tmppath[1]=$p;
    if($debug){print STDERR "Detected (PATH enviroment variable)\n";}
  }else{
    if(-x $tmppath[1]."/src/BUILD/$uname/proteinortho_clustering"){
      $tmppath[1]=$tmppath[1]."/src/BUILD/$uname";
      if($debug){print STDERR "Detected ".$tmppath[1]."\n";}
    }elsif(-x "/usr/bin/proteinortho_clustering"){
      $tmppath[1]="/usr/bin/";
      if($debug){print STDERR "Detected ".$tmppath[1]."\n";}
    }elsif(-x "/usr/local/bin/proteinortho_clustering"){
      $tmppath[1]="/usr/local/bin/";
      if($debug){print STDERR "Detected ".$tmppath[1]."\n";}
    }elsif(-x "$binpath/proteinortho_clustering"){
      $tmppath[1]="$binpath/";
      if($debug){print STDERR "Detected ".$tmppath[1]."\n";}
    }
  }

  if(!-x $tmppath[1]."/proteinortho_clustering"){
    &Error("cannot find proteinortho_clustering in: the current directory '.', ./src/, ./src/BUILD/$uname , /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
    exit 1;
  }
  # if(!-x "$tmppath[1]/proteinortho_cleanupblastgraph"){
  #   &Error("cannot find proteinortho_cleanupblastgraph in $tmppath[1].\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n";
  #   exit 0;
  # }
  # if(!-x "$tmppath[1]/po_tree"){
  #   &Error("cannot find po_tree in $tmppath[1].\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n";
  #   exit 0;
  # }
  if(!-x $tmppath[1]."/proteinortho2html.pl"){
   &Error("cannot find proteinortho2html.pl in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
    exit 1;
  }
  # if(!-x "$tmppath[1]/proteinortho2tree.pl"){
  #   &Error("cannot find proteinortho2tree.pl in $tmppath[1].\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n";
  #   exit 0;
  # }
  if(!-x $tmppath[1]."/proteinortho_ffadj_mcs.py" && $synteny){
    &Error("cannot find proteinortho_ffadj_mcs.py$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
    exit 1;
  }
  if(!-x $tmppath[1]."/proteinortho_singletons.pl"){
    &Error("cannot find proteinortho_singletons.pl$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
    exit 1;
  }
  if(!-x $tmppath[1]."/proteinortho_graphMinusRemovegraph"){
    &Error("cannot find proteinortho_graphMinusRemovegraph$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
    exit 1;
  }
  if(!-x $tmppath[1]."/proteinortho_do_mcl.pl"){
    &Error("cannot find proteinortho_do_mcl.pl$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
      exit 1;
  }
  if(!-x $tmppath[1]."/proteinortho2xml.pl"){
    &Error("cannot find proteinortho2xml.pl$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
    exit 1;
  }
  return $tmppath[1];
}

sub edgeweight {
  # 1e-10 = 0.15, 1e-20 = 0.3, 1e-40 = 0.6, 1e-66+ = 1.0
  if ($_[0] == 0) {return 1;}
  my $x = -1*&log10($_[0])/100*1.5;
  if ($x > 1) {return 1;}
  if ($x <= 0) {return 0.0001;}
  return $x;
}

# sub log10 {
#   return log($_[0])/log(10);
# }

sub write_descriptions {
  if($verbose){print STDERR "Writing sequence descriptions\n";}
  open DESC, '>', $desctable;
  foreach my $file (@files) {
    if ($verbose) {print STDERR "Extracting descriptions from '$file'\t(".$gene_counter{$file}." entries)\n";}
    open FASTA, '<', $file;
    while (<FASTA>) {
      $_=~s/[\r\n]+$//;
      if (m/^>(\S+)(\s+(.*))?$/) {
        print DESC $1, "\t", ($3 || "unannotated sequence"), "\n";
      }
    }
  }
  if($verbose){print STDERR "[OUTPUT] -> written to $desctable\n";}

}

&reset_locale();
