Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Sean Solari
exPAM
Commits
d1d4c5da
Commit
d1d4c5da
authored
Apr 25, 2022
by
Sean Solari
Browse files
Bug fixes for database creation
parent
e0779df2
Changes
14
Hide whitespace changes
Inline
Side-by-side
MANIFEST.in
View file @
d1d4c5da
include LICENSE
include README.md
include MANIFEST.in
include src/expam/c/*.pyx
\ No newline at end of file
include MANIFEST.in
\ No newline at end of file
src/expam/__init__.py
deleted
100644 → 0
View file @
e0779df2
import gzip

# Archive suffixes that expam recognises on reference sequence files.
COMPRESSION_EXTNS = ['.tar.gz', '.tar', '.gz']

# How to open a sequence file when its suffix carries no special meaning.
DEFAULT_MODE = "rb"
DEFAULT_OPENER = open

# Per-suffix file openers: gzipped inputs are read through gzip.open,
# always in binary mode.
COMP_PARSE = {
    ".tar.gz": {"mode": "rb", "opener": gzip.open},
    ".gz": {"mode": "rb", "opener": gzip.open},
}
from
.main
import
ExpamOptions
,
clear_logs
,
CommandGroup
,
PlotLogs
src/expam/cli/build.py
View file @
d1d4c5da
import
os
from
expam.main
import
CommandGroup
,
ExpamOptions
,
clear_logs
from
expam.
cli.
main
import
CommandGroup
,
ExpamOptions
,
clear_logs
from
expam.database
import
FileLocationConfig
from
expam.database.config
import
JSONConfig
,
create_database
,
make_database_config
,
validate_database_file_configuration
from
expam.logger
import
Timer
...
...
src/expam/cli/classify.py
View file @
d1d4c5da
...
...
@@ -3,7 +3,7 @@ from expam.classify import ResultsPathConfig
from
expam.classify.classify
import
ClassificationResults
,
name_to_id
,
run_classifier
from
expam.classify.config
import
make_results_config
,
validate_results_configuration
from
expam.classify.taxonomy
import
TaxonomyNCBI
from
expam.main
import
CommandGroup
,
ExpamOptions
,
clear_logs
from
expam.
cli.
main
import
CommandGroup
,
ExpamOptions
,
clear_logs
from
expam.database
import
FileLocationConfig
from
expam.database.config
import
JSONConfig
,
make_database_config
,
validate_database_file_configuration
from
expam.utils
import
die
,
is_hex
,
make_path_absolute
...
...
src/expam/cli/main.py
0 → 100644
View file @
d1d4c5da
from
argparse
import
ArgumentParser
,
Namespace
,
RawTextHelpFormatter
from
collections
import
namedtuple
import
datetime
import
os
import
shutil
import
matplotlib.pyplot
as
plt
import
numpy
as
np
from
expam.utils
import
die
,
ls
,
make_path_absolute
# Immutable bundle of every CLI option expam understands; populated by
# retrieve_arguments() and handed to the command groups.
ExpamOptions = namedtuple(
    'ExpamOptions',
    [
        # Runtime arguments
        'command', 'db_name', 'k', 'n', 's', 'phylogeny', 'alpha',
        # Directory arguments
        'directory', 'out_url', 'truth_dir',
        # Parameter arguments
        'length', 'pile', 'error_rate', 'first_n', 'paired_end',
        # Summary arguments
        'plot', 'cutoff', 'cpm', 'taxonomy',
        # Plot arguments
        'groups', 'phyla', 'keep_zeros', 'ignore_names',
        'colour_list', 'rank', 'log_scores', 'itol_mode',
        # Tree arguments
        'use_sourmash', 'use_rapidnj', 'use_quicktree',
    ],
)
def retrieve_arguments() -> ExpamOptions:
    """Parse the expam command line and pack the result into an ExpamOptions tuple.

    The database name falls back to the EXPAM_DB_DIR environment variable when
    -db is not supplied; an explicit -db value is made absolute w.r.t. the cwd.
    """
    parser = ArgumentParser(
        description=" expam CLI\n--------------\n",
        formatter_class=RawTextHelpFormatter,
    )

    # Positional command selecting which command group runs.
    parser.add_argument(
        "command",
        default=None,
        help='\nCommand to execute. Valid commands include:\n'
             '-------------------------------------------\n'
             'create:-\tInitialise database.\n'
             'build:-\t\tStart building database.\n'
             'print:-\t\tPrint current database parameters.\n'
             'run:-\t\tRun reads against database.\n'
             'add:-\t\tAdd sequence to the database.\n'
             'remove:-\tRemove sequence from database (only impacts future db builds).\n'
             'set:-\t\tSet database build parameters.\n'
             'to_taxonomy:-\t\tConvert results to taxonomic setting.\n'
             'phylotree:-\t\tDraw results on phylotree.\n'
             'draw_tree:-\t\tDraw the reference tree.\n'
             'download_taxonomy:-\t\tDownload taxonomic information for reference seqeunces.\n'
             'cutoff:-\t\tApply cutoff to some set of already processed classifications. THIS WILL OVERWRITE OLD RESULTS!\n'
             'mashtree:-\tCreate mashtree from current sequences and add to database.\n'
             'quickrun:-\tInitialise, set parameters and start building db (assumes\n'
             '\t\t\tsequences all lie in the same folder).\n'
             'make_reads:-\tUniformly sample reads of length l from some input sequence.\n'
             '\t\tThis is for testing purposes only, and is not a replacement\n'
             '\t\tfor actual read generating software.\n',
        metavar="[command]",
    )

    # Short options.
    parser.add_argument("-db", "--db_name", dest="db_name",
                        help="Name of database.", metavar="[database name]")
    parser.add_argument("-k", "--kmer", dest="k",
                        help="Length of mer used for analysis.", metavar="[k value (int)]")
    parser.add_argument("-n", "--n-processes", dest="n",
                        help="Number of CPUs to use for processing.", metavar="[n (int)]")
    parser.add_argument("-s", "--sketch", dest="s",
                        help="Sketch size for mash.", metavar="[sketch size (int)]")
    parser.add_argument("-p", "--phylogeny", dest="phylogeny",
                        help="URL of Newick file containing phylogeny.", metavar="[phylogeny URL]")
    parser.add_argument("-d", "--directory", dest="directory", action="append",
                        help="File URL, context depending on command supplied.", metavar="[directory]")
    parser.add_argument("-l", "--length", dest="length",
                        help="Length of simulated reads.", metavar="[read length]")
    parser.add_argument("-o", "--out", dest="out_url",
                        help="Where to save classification results.", metavar="[out URL]")
    parser.add_argument("-y", "--pile", dest="pile",
                        help="Number of genomes to pile at a time (or inf).", metavar="[pile size]")
    parser.add_argument("-e", "--error-rate", dest="error_rate",
                        help="Generate error in reads (error ~ reads with errors / reads).",
                        metavar="[error rate]")
    parser.add_argument("-t", "--truth", dest="truth_dir",
                        help="Location of truth dataset.")

    # Long-only flags and parameters.
    parser.add_argument("--plot", dest="plot", default=False, action="store_true",
                        help="Plot timing data of database build.")
    parser.add_argument("--first", dest="first_n", default=None,
                        help="Add first n genomes in folder.")
    parser.add_argument("--cutoff", dest="cutoff", default=0,
                        help="Ignore organisms with less than `cutoff` reads in results.")
    parser.add_argument("--cpm", dest="cpm", default=100,
                        help="Counts/million cutoff for read-count to be non-negligible.")
    parser.add_argument("--taxonomy", dest="taxonomy", default=False, action="store_true",
                        help="Convert phylogenetic results to taxonomic results.")
    parser.add_argument("--phyla", dest="phyla", default=False, action="store_true",
                        help="Colour phylotree results by phyla.")
    parser.add_argument("--rank", dest="rank", default=None,
                        help="Rank at which to sort results.")
    parser.add_argument("--keep-zeros", dest="keep_zeros", default=False, action="store_true",
                        help="Keep nodes of output where no reads have been assigned.")
    parser.add_argument("--ignore-names", dest="ignore_names", default=False, action="store_true")
    parser.add_argument("--group", dest="groups", action="append", nargs="+",
                        help="Space-separated list of sample files to be treated as a single group in phylotree.")
    parser.add_argument("--colour-list", dest="colour_list", nargs="+",
                        help="List of colours to use when plotting groups in phylotree.")
    parser.add_argument("--sourmash", dest="use_sourmash", default=False, action="store_true",
                        help="Use sourmash for distance estimation.")
    parser.add_argument("--rapidnj", dest="use_rapidnj", default=True, action="store_true",
                        help="Use RapidNJ for Neighbour-Joining algorithm.")
    parser.add_argument("--quicktree", dest="use_quicktree", default=False, action="store_true",
                        help="Use QuickTree for Neighbour-Joining algorithm.")
    parser.add_argument("--paired", dest="paired_end", default=False, action="store_true",
                        help="Treat reads as paired-end.")
    parser.add_argument("--alpha", dest="alpha", default=0.1,
                        help="Percentage requirement for classification subtrees (see Tutorials 1 & 2).")
    parser.add_argument("--log-scores", dest="log_scores", default=False, action="store_true",
                        help="Log transformation to opacity scores on phylotree (think uneven distributions).")
    parser.add_argument("--itol", dest="itol_mode", default=False, action="store_true",
                        help="Output plotting data in ITOL format.")

    # Parse arguments.
    args: Namespace = parser.parse_args()

    # Database location: explicit flag wins, otherwise consult the environment.
    if args.db_name is None:
        args.db_name = db_name_from_environment()
    else:
        args.db_name = make_path_absolute(args.db_name, os.getcwd())

    return ExpamOptions(**{field: getattr(args, field) for field in ExpamOptions._fields})
def db_name_from_environment():
    """Return the database directory named by EXPAM_DB_DIR, or None if unset."""
    return os.environ.get("EXPAM_DB_DIR")
def clear_logs(log_path) -> None:
    """Reset the log directory: delete it (if present) and recreate it empty.

    Exits the program via die() when the fresh directory cannot be created.
    """
    print("Clearing old log files...")

    # Best-effort removal -- a missing directory is already "cleared".
    try:
        shutil.rmtree(log_path)
    except FileNotFoundError:
        pass

    # Recreate the now-absent directory; abort if that fails.
    try:
        os.mkdir(log_path)
    except OSError:
        die("Can't make log path %s!" % log_path)
class CommandGroup:
    """Base class for expam CLI command handlers.

    Subclasses implement one method per CLI command and advertise the command
    names they accept through the `commands` class attribute; `run` dispatches
    a command name to the method of the same name.
    """

    # Names of the CLI commands this group responds to.
    # FIX: was initialised with `{}`, which is an empty *dict*, contradicting
    # the set[str] annotation and breaking set operations on subclasses.
    commands: set[str] = set()

    @classmethod
    def take_args(cls, args: "ExpamOptions") -> dict:
        """Extract constructor kwargs for this group from parsed CLI options.

        Base implementation takes nothing; subclasses override.
        """
        return {}

    def run(self, command) -> None:
        """Dispatch `command` to the handler method of the same name.

        Raises AttributeError if no such handler exists.
        """
        # FIX: only the lookup sits in the try-block. Previously the *call*
        # was inside it too, so an AttributeError raised within a legitimate
        # handler was misreported as "Command not found".
        try:
            handler = getattr(self, command)
        except AttributeError:
            raise AttributeError("Command %s not found!" % command) from None
        handler()

    @staticmethod
    def parse_ints(*params):
        """Yield each parameter coerced to int; None values pass through.

        Values such as "7.0" are accepted (parsed via float first); anything
        non-numeric or non-integral terminates the program via die().
        """
        for param in params:
            invalid_msg = "Invalid parameter (%s), must be integer!" % str(param)

            if param is not None:
                try:
                    # Convert to float first so inputs like "7.0" are accepted.
                    param = float(param)
                except ValueError:
                    die(invalid_msg)

                # Convert to int and check nothing was lost (rejects "7.5").
                new_param = int(param)
                if new_param != param:
                    die(invalid_msg)
                param = new_param

            yield param

    @staticmethod
    def parse_floats(*params):
        """Yield each parameter coerced to float; None values pass through.

        Non-numeric values terminate the program via die().
        """
        for param in params:
            # FIX: the message previously claimed the value "must be integer",
            # but this parser accepts any real number.
            invalid_msg = "Invalid parameter (%s), must be a number!" % str(param)

            if param is not None:
                try:
                    param = float(param)
                except ValueError:
                    die(invalid_msg)

            yield param

    @staticmethod
    def get_user_confirmation(msg):
        """Prompt until the user answers y/n; True for "y", False for "n".

        NOTE(review): an invalid answer calls die(), which presumably exits the
        program, so in practice the loop never re-prompts -- confirm intended.
        """
        if "y/n" not in msg:
            msg += " (y/n)"

        while True:
            ans = input(msg)
            ans = ans.lower().strip()

            if ans == "y":
                return True
            elif ans == "n":
                return False
            else:
                die("Invalid response, please provide y/n.")

    @staticmethod
    def get_time(data) -> datetime.datetime:
        """Parse a log line prefix "M/D/YYYY H:M:S AM|PM ..." into a datetime."""
        date_data, time_data, am_pm = data.split(" ")[:3]

        mon, day, year = (int(v) for v in date_data.split("/"))
        # FIX: renamed `min` -> `minute`; the old name shadowed the builtin.
        hr, minute, sec = (int(v) for v in time_data.split(":"))

        # Normalise 12-hour clock to 24-hour values.
        if "PM" in am_pm and hr < 12:
            hr += 12
        elif "AM" in am_pm and hr == 12:
            hr = 0

        return datetime.datetime(year, mon, day, hr, minute, sec)

    def get_t0(self, logs_dir: str):
        """Return (main log path, timestamp of its first line).

        Raises ValueError when no "*_main*.log" file exists in `logs_dir`.
        """
        def _first(itr):
            # One-element peek that converts "no main log" into a clear error.
            try:
                return itr[0]
            except IndexError:
                raise ValueError("Can't find main log file!")

        # Get effective t0 from main log.
        main_log_dir = _first([
            os.path.join(logs_dir, log_name)
            for log_name in ls(logs_dir, ext=".log")
            if "_main" in log_name
        ])

        with open(main_log_dir, "r") as f:
            log_data = f.readline()

        # Get time of first log point.
        return main_log_dir, self.get_time(log_data)
class PlotLogs:
    """Render build-timing data from expam worker logs to logs_dir/timing.png."""

    def __init__(self, logs_dir: str, t0: datetime.datetime) -> None:
        self.logs_dir = logs_dir
        self.t0 = t0

    def plot(self):
        """Read every worker log and plot per-task durations against elapsed time."""
        # Human-readable labels for each job type (cleared after first use so
        # the legend carries no duplicates).
        job_type_map = {
            "request_extension": "remap",
            "disjoint": "disjoint",
            "collapse_pile": "fill",
            "import_sequence": "kmers",
            "send_kmers": "send",
        }
        # Fixed colour per job type.
        job_colours = {
            "request_extension": "red",
            "disjoint": "blue",
            "collapse_pile": "green",
            "import_sequence": "orange",
            "send_kmers": "black",
        }
        # Which job types appear in which kind of worker log.
        process_type_jobs = {
            "_extract": ["import_sequence", "send_kmers"],
            "_union": ["request_extension", "disjoint", "collapse_pile"],
        }

        fig, ax = plt.subplots(figsize=(20, 15))
        ax.grid()

        # All per-worker logs, excluding the main process log.
        log_urls = [
            os.path.join(self.logs_dir, log_name)
            for log_name in ls(self.logs_dir, ext=".log")
            if "_main.log" not in log_name
        ]

        # Plot data one child process at a time.
        for log_url in log_urls:
            process_type = "_extract" if "_extract.log" in log_url else "_union"
            jobs = process_type_jobs[process_type]

            # (x, y) series per job type: hours since t0 vs minutes taken.
            series = {job_name: ([], []) for job_name in jobs}

            with open(log_url, "r") as f:
                lines = f.readlines()

            for line in lines:
                for job_name in jobs:
                    if job_name in line:
                        xs, ys = series[job_name]
                        xs.append(self._get_time(line) / 3600)
                        ys.append(self._get_data(line) / 60)

            for job_name in jobs:
                xs, ys = series[job_name]
                ax.plot(
                    np.array(xs),
                    np.array(ys),
                    label=job_type_map[job_name],
                    marker="x",
                    color=job_colours[job_name],
                )
                # Don't allow duplicate labels.
                job_type_map[job_name] = ""

        ax.set(
            xlabel="Time since start [hrs]",
            ylabel="Time taken for task [mins]",
            title="Time taken to do tasks",
        )
        ax.legend(loc='best')
        fig.savefig(os.path.join(self.logs_dir, "timing.png"))

    def _get_time(self, string):
        """Seconds elapsed between self.t0 and the timestamp opening `string`."""
        stamp, clock, meridiem = string.split(" ")[:3]
        month, day, year = (int(part) for part in stamp.split("/"))
        hour, minute, second = (int(part) for part in clock.split(":"))

        # 12-hour to 24-hour conversion.
        if "PM" in meridiem and hour < 12:
            hour += 12
        elif "AM" in meridiem and hour == 12:
            hour = 0

        moment = datetime.datetime(year, month, day, hour, minute, second)
        return (moment - self.t0).total_seconds()

    @staticmethod
    def _get_data(string):
        """Duration reported after " took " in a log line, as a float.

        NOTE(review): assumes the line ends with a 3-character unit suffix
        that is sliced off before parsing -- confirm against the log format.
        """
        reported = string.split(" took ")[1]
        return float(reported[:-3])
src/expam/cli/tree.py
View file @
d1d4c5da
...
...
@@ -7,7 +7,7 @@ from expam.classify import ResultsPathConfig
from
expam.classify.classify
import
ClassificationResults
,
name_to_id
from
expam.classify.config
import
make_results_config
,
validate_results_configuration
from
expam.classify.taxonomy
import
TaxonomyNCBI
from
expam.main
import
CommandGroup
,
ExpamOptions
from
expam.
cli.
main
import
CommandGroup
,
ExpamOptions
from
expam.database
import
FileLocationConfig
from
expam.database.config
import
JSONConfig
,
make_database_config
,
validate_database_file_configuration
from
expam.tree
import
PHYLA_COLOURS
...
...
src/expam/cli/utils.py
View file @
d1d4c5da
...
...
@@ -2,7 +2,7 @@ import os
import
matplotlib.pyplot
as
plt
from
expam.classify
import
ResultsPathConfig
from
expam.classify.config
import
make_results_config
,
validate_classification_results
,
validate_results_configuration
from
expam.main
import
CommandGroup
,
ExpamOptions
from
expam.
cli.
main
import
CommandGroup
,
ExpamOptions
from
expam.database
import
FileLocationConfig
from
expam.database.config
import
JSONConfig
,
make_database_config
,
validate_database_file_configuration
from
expam.sequences
import
format_name
...
...
src/expam/database/__init__.py
View file @
d1d4c5da
...
...
@@ -30,7 +30,7 @@ DATABASE_FILE_RELATIVE_PATH = os.path.join(DATABASE_RELATIVE_PATH, "expam_db.h5"
FileLocationConfig
=
namedtuple
(
'FileLocationConfig'
,
[
'database'
,
'phylogeny'
,
'logs'
,
'conf'
,
'base'
,
'database'
,
'phylogeny'
,
'logs'
,
'conf'
,
'accession_id'
,
'taxid_lineage'
,
'taxon_rank'
,
'lca_matrix'
,
'database_file'
...
...
src/expam/database/build.py
View file @
d1d4c5da
...
...
@@ -5,13 +5,12 @@ import os
import
subprocess
import
numpy
as
np
from
expam
import
COMP_PARSE
from
expam.database
import
CHUNK_SIZE
,
TIMEOUT
,
UNION_RATIO
,
FileLocationConfig
,
expam_dtypes
from
expam.database.config
import
load_database_config
from
expam.process.genome
import
ExtractWorker
from
expam.process.manager
import
ControlCenter
,
ExpamProcesses
from
expam.process.piler
import
UnionWorker
from
expam.sequences
import
check_suffix
from
expam.sequences
import
COMP_PARSE
,
check_suffix
from
expam.tree.tree
import
Index
,
propose_lca
from
expam.utils
import
ls
...
...
src/expam/database/config.py
View file @
d1d4c5da
...
...
@@ -216,6 +216,7 @@ class ExpamDatabaseExistsError(Exception):
def
make_database_config
(
db_path
:
str
)
->
FileLocationConfig
:
database_file_locations
=
{
'base'
:
db_path
,
'database'
:
os
.
path
.
join
(
db_path
,
DATABASE_RELATIVE_PATH
),
'phylogeny'
:
os
.
path
.
join
(
db_path
,
PHYLOGENY_RELATIVE_PATH
),
'logs'
:
os
.
path
.
join
(
db_path
,
LOG_RELATIVE_PATH
),
...
...
@@ -239,7 +240,7 @@ def load_database_config(db_path: str) -> FileLocationConfig:
def
create_database
(
config
:
FileLocationConfig
)
->
None
:
for
field_to_check
in
(
'database'
,
'phylogeny'
,
'logs'
):
for
field_to_check
in
(
'base'
,
'database'
,
'phylogeny'
,
'logs'
):
path
:
str
=
getattr
(
config
,
field_to_check
)
if
not
os
.
path
.
exists
(
path
):
...
...
@@ -247,9 +248,13 @@ def create_database(config: FileLocationConfig) -> None:
else
:
raise
ExpamDatabaseExistsError
(
"Database %s already exists!"
%
config
.
database
)
# Create new configuration file.
conf
:
JSONConfig
=
JSONConfig
()
conf
.
save
(
url
=
config
.
conf
)
def
validate_database_file_configuration
(
proposed_config
:
FileLocationConfig
)
->
bool
:
for
field_to_check
in
(
'database'
,
'phylogeny'
,
'conf'
):
for
field_to_check
in
(
'base'
,
'database'
,
'phylogeny'
,
'conf'
):
if
not
os
.
path
.
exists
(
getattr
(
proposed_config
,
field_to_check
)):
return
False
else
:
...
...
src/expam/ext/kmers/__init__py
→
src/expam/ext/kmers/__init__
.
py
View file @
d1d4c5da
File moved
src/expam/main.py
View file @
d1d4c5da
from
argparse
import
ArgumentParser
,
Namespace
,
RawTextHelpFormatter
from
collections
import
namedtuple
import
datetime
import
multiprocessing
import
os
import
platform
import
shutil
import
matplotlib.pyplot
as
plt
import
numpy
as
np
from
expam.cli.build
import
BuildCommand
from
expam.cli.classify
import
ClassifyCommand
from
expam.cli.main
import
CommandGroup
,
ExpamOptions
,
retrieve_arguments
from
expam.cli.tree
import
TreeCommand
from
expam.utils
import
die
,
ls
,
make_path_absolute
# Immutable bundle of every CLI option expam understands (pre-refactor
# variant of this module: includes the separate 'sketch' parameter field).
ExpamOptions = namedtuple(
    'ExpamOptions',
    [
        # Runtime arguments
        'command', 'db_name', 'k', 'n', 's', 'phylogeny', 'alpha',
        # Directory arguments
        'directory', 'out_url', 'truth_dir',
        # Parameter arguments
        'length', 'pile', 'error_rate', 'first_n', 'sketch', 'paired_end',
        # Summary arguments
        'plot', 'cutoff', 'cpm', 'taxonomy',
        # Plot arguments
        'groups', 'phyla', 'keep_zeros', 'ignore_names',
        'colour_list', 'rank', 'log_scores', 'itol_mode',
        # Tree arguments
        'use_sourmash', 'use_rapidnj', 'use_quicktree',
    ],
)
def
retrieve_arguments
()
->
ExpamOptions
:
parser
=
ArgumentParser
(
description
=
" expam CLI
\n
--------------
\n
"
,
formatter_class
=
RawTextHelpFormatter
)
parser
.
add_argument
(
"command"
,
default
=
None
,
help
=
'
\n
Command to execute. Valid commands include:
\n
'
'-------------------------------------------
\n
'
'create:-
\t
Initialise database.
\n
'
'build:-
\t\t
Start building database.
\n
'
'print:-
\t\t
Print current database parameters.
\n
'
'run:-
\t\t
Run reads against database.
\n
'
'add:-
\t\t
Add sequence to the database.
\n
'
'remove:-
\t
Remove sequence from database (only impacts future db builds).
\n
'
'set:-
\t\t
Set database build parameters.
\n
'
'to_taxonomy:-
\t\t
Convert results to taxonomic setting.
\n
'
'phylotree:-
\t\t
Draw results on phylotree.
\n
'
'draw_tree:-
\t\t
Draw the reference tree.
\n
'
'download_taxonomy:-
\t\t
Download taxonomic information for reference seqeunces.
\n
'
'cutoff:-
\t\t
Apply cutoff to some set of already processed classifications. THIS WILL OVERWRITE OLD RESULTS!
\n
'
'mashtree:-
\t
Create mashtree from current sequences and add to database.
\n
'
'quickrun:-
\t
Initialise, set parameters and start building db (assumes
\n
'
'
\t\t\t
sequences all lie in the same folder).
\n
'
'make_reads:-
\t
Uniformly sample reads of length l from some input sequence.
\n
'
'
\t\t
This is for testing purposes only, and is not a replacement
\n
'
'
\t\t
for actual read generating software.
\n
'
,
metavar
=
"[command]"
)
parser
.
add_argument
(
"-db"
,
"--db_name"
,
dest
=
"db_name"
,
help
=
"Name of database."
,
metavar
=
"[database name]"
)
parser
.
add_argument
(
"-k"
,
"--kmer"
,
dest
=
"k"
,
help
=
"Length of mer used for analysis."
,
metavar
=
"[k value (int)]"
)
parser
.
add_argument
(
"-n"
,
"--n-processes"
,
dest
=
"n"
,
help
=
"Number of CPUs to use for processing."
,
metavar
=
"[n (int)]"
)
parser
.
add_argument
(
"-s"
,
"--sketch"
,
dest
=
"sketch"
,
help
=
"Sketch size for mash."
,
metavar
=
"[sketch size (int)]"
)
parser
.
add_argument
(
"-p"
,
"--phylogeny"
,
dest
=
"phylogeny"
,
help
=
"URL of Newick file containing phylogeny."
,
metavar
=
"[phylogeny URL]"
)
parser
.
add_argument
(
"-d"
,
"--directory"
,
dest
=
"directory"
,
action
=
"append"
,
help
=
"File URL, context depending on command supplied."
,
metavar
=
"[directory]"
)
parser
.
add_argument
(
"-l"
,
"--length"
,
dest
=
"length"
,
help
=
"Length of simulated reads."
,
metavar
=
"[read length]"
)
parser
.
add_argument
(
"-o"
,
"--out"
,
dest
=
"out_url"
,
help
=
"Where to save classification results."
,
metavar
=
"[out URL]"
)
parser
.
add_argument
(
"-y"
,
"--pile"
,
dest
=
"pile"
,
help
=
"Number of genomes to pile at a time (or inf)."
,
metavar
=
"[pile size]"
)
parser
.
add_argument
(
"-e"
,
"--error-rate"
,
dest
=
"error_rate"
,
help
=
"Generate error in reads (error ~ reads with errors / reads)."
,
metavar
=
"[error rate]"
)
parser
.
add_argument
(
"-t"
,
"--truth"
,
dest
=
"truth_dir"
,
help
=
"Location of truth dataset."
)
parser
.
add_argument
(
"--plot"
,
dest
=
"plot"
,
default
=
False
,
action
=
"store_true"
,
help
=
"Plot timing data of database build."
)
parser
.
add_argument
(
"--first"
,
dest
=
"first_n"
,
default
=
None
,
help
=
"Add first n genomes in folder."
)
parser
.
add_argument
(
"--cutoff"
,
dest
=
"cutoff"
,
default
=
0
,
help
=
"Ignore organisms with less than `cutoff` reads in results."
)
parser
.
add_argument
(
"--cpm"
,
dest
=
"cpm"
,
default
=
100
,
help
=
"Counts/million cutoff for read-count to be non-negligible."
)
parser
.
add_argument
(
"--taxonomy"
,
dest
=
"taxonomy"
,
default
=
False
,
action
=
"store_true"
,
help
=
"Convert phylogenetic results to taxonomic results."
)
parser
.
add_argument
(
"--phyla"
,
dest
=
"phyla"
,
default
=
False
,
action
=
"store_true"
,
help
=
"Colour phylotree results by phyla."
)
parser
.
add_argument
(
"--rank"
,
dest
=
"rank"
,
default
=
None
,
help
=
"Rank at which to sort results."
)
parser
.
add_argument
(
"--keep-zeros"
,
dest
=
"keep_zeros"
,
default
=
False
,
action
=
"store_true"
,
help
=
"Keep nodes of output where no reads have been assigned."
)
parser
.
add_argument
(
"--ignore-names"
,
dest
=
"ignore_node_names"
,
default
=
False
,
action
=
"store_true"
)
parser
.
add_argument
(
"--group"
,
dest
=
"groups"
,
action
=
"append"
,
nargs
=
"+"
,
help
=
"Space-separated list of sample files to be treated as a single group in phylotree."
)
parser
.
add_argument
(
"--colour-list"
,
dest
=
"colour_list"
,
nargs
=
"+"
,
help
=
"List of colours to use when plotting groups in phylotree."
)
parser
.
add_argument
(
"--circle-scale"
,
dest
=
"circle_scale"
,
default
=
1.0
,
help
=
"Scale of circles that represent splits in phylotree."
)
parser
.
add_argument
(
"--sourmash"
,
dest
=
"use_sourmash"
,
default
=
False
,
action
=
"store_true"
,
help
=
"Use sourmash for distance estimation."
)
parser
.
add_argument
(
"--rapidnj"
,
dest
=
"use_rapidnj"
,
default
=
True
,
action
=
"store_true"
,
help
=
"Use RapidNJ for Neighbour-Joining algorithm."
)
parser
.
add_argument
(
"--quicktree"
,
dest
=
"use_quicktree"
,
default
=
False
,
action
=
"store_true"
,
help
=
"Use QuickTree for Neighbour-Joining algorithm."
)
parser
.
add_argument
(
"--paired"
,
dest
=
"paired_end"
,
default
=
False
,
action
=
"store_true"
,
help
=
"Treat reads as paired-end."
)
parser
.
add_argument
(
"--alpha"
,
dest
=
"alpha"
,
default
=
0.1
,