Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Sean Solari
exPAM
Commits
0b66ec7a
Commit
0b66ec7a
authored
Mar 14, 2022
by
Sean Solari
Browse files
Grouping specification changes. Added log scale option
parent
2a61fae2
Changes
5
Hide whitespace changes
Inline
Side-by-side
setup.py
View file @
0b66ec7a
...
...
@@ -90,7 +90,7 @@ setup(
entry_points
=
{
"console_scripts"
:
[
"expam=expam.cli:main"
,
"expamlimit=expam.sandbox:main"
"expam
_
limit=expam.sandbox:main"
],
},
#
...
...
src/expam/classification.py
View file @
0b66ec7a
import
datetime
from
genericpath
import
isfile
import
os
import
re
import
requests
import
shutil
import
time
from
ctypes
import
*
from
math
import
floor
...
...
@@ -93,7 +91,7 @@ def string_to_tuple(st):
def
run_classifier
(
read_paths
,
out_dir
,
db_dir
,
k
,
n
,
phylogeny_path
,
keys_shape
,
values_shape
,
logging_dir
,
taxonomy
=
False
,
cutoff
=
0.0
,
groups
=
None
,
keep_zeros
=
False
,
cpm
=
0.0
,
use_node_names
=
True
,
phyla
=
False
,
name_taxa
=
None
,
colour_list
=
None
,
circle_scale
=
1.0
,
paired_end
=
False
,
alpha
=
1.0
):
paired_end
=
False
,
alpha
=
1.0
,
log_scores
=
False
):
# Verify results path.
if
not
os
.
path
.
exists
(
out_dir
):
...
...
@@ -166,7 +164,8 @@ def run_classifier(read_paths, out_dir, db_dir, k, n, phylogeny_path, keys_shape
phyla
=
phyla
,
name_taxa
=
name_taxa
,
colour_list
=
colour_list
,
circle_scale
=
circle_scale
circle_scale
=
circle_scale
,
log_scores
=
log_scores
)
if
taxonomy
:
...
...
@@ -1107,14 +1106,15 @@ class Distribution:
class
ClassificationResults
:
def
__init__
(
self
,
index
,
phylogeny_index
,
in_dir
,
out_dir
,
groups
=
None
,
keep_zeros
=
False
,
cutoff
=
0.0
,
cpm
=
0.0
,
use_node_names
=
True
,
phyla
=
False
,
name_taxa
=
None
,
colour_list
=
None
,
circle_scale
=
1.0
):
use_node_names
=
True
,
phyla
=
False
,
name_taxa
=
None
,
colour_list
=
None
,
circle_scale
=
1.0
,
log_scores
=
False
):
self
.
index
=
index
self
.
phylogeny_index
=
phylogeny_index
self
.
in_dir
=
in_dir
self
.
out_dir
=
out_dir
self
.
groups
=
groups
# [(colour, (name1, name2, ...)),
(colour, (name3, ...)),
...]
self
.
groups
=
groups
# [(colour, (name1, name2, ...)), ...]
self
.
keep_zeros
=
keep_zeros
self
.
cutoff
=
cutoff
# Cutoff by counts.
...
...
@@ -1125,6 +1125,7 @@ class ClassificationResults:
self
.
colour_list
=
colour_list
self
.
name_taxa
=
name_taxa
self
.
circle_scale
=
circle_scale
self
.
log_scores
=
log_scores
if
self
.
phyla
and
self
.
name_taxa
is
None
:
raise
Exception
(
"Inclusion of phyla requires mapping of names to taxonomic lineages!"
)
...
...
@@ -1312,7 +1313,8 @@ class ClassificationResults:
name_to_taxon
=
self
.
name_taxa
,
use_phyla
=
self
.
phyla
,
keep_zeros
=
self
.
keep_zeros
,
use_node_names
=
self
.
use_node_names
use_node_names
=
self
.
use_node_names
,
log_scores
=
self
.
log_scores
)
# Draw unclassified tree.
...
...
@@ -1326,5 +1328,6 @@ class ClassificationResults:
name_to_taxon
=
self
.
name_taxa
,
use_phyla
=
self
.
phyla
,
keep_zeros
=
self
.
keep_zeros
,
use_node_names
=
self
.
use_node_names
use_node_names
=
self
.
use_node_names
,
log_scores
=
self
.
log_scores
)
src/expam/cli.py
View file @
0b66ec7a
...
...
@@ -900,8 +900,10 @@ def main():
help
=
"Use QuickTree for Neighbour-Joining algorithm."
)
parser
.
add_argument
(
"--paired"
,
dest
=
"paired_end"
,
default
=
False
,
action
=
"store_true"
,
help
=
"Treat reads as paired-end."
)
parser
.
add_argument
(
"--alpha"
,
dest
=
"alpha"
,
default
=
None
,
parser
.
add_argument
(
"--alpha"
,
dest
=
"alpha"
,
default
=
0.1
,
help
=
"Percentage requirement for classification subtrees (see Tutorials 1 & 2)."
)
parser
.
add_argument
(
"--log-scores"
,
dest
=
"log_scores"
,
default
=
False
,
action
=
"store_true"
,
help
=
"Log transformation to opacity scores on phylotree (think uneven distributions)."
)
# Parse arguments.
args
=
parser
.
parse_args
()
...
...
@@ -912,14 +914,14 @@ def main():
param_args
=
args
.
length
,
args
.
pile
,
args
.
error_rate
,
args
.
first_n
,
args
.
sketch
,
args
.
paired_end
summary_args
=
args
.
plot
,
args
.
cutoff
,
args
.
cpm
,
args
.
taxonomy
plot_args
=
args
.
groups
,
args
.
phyla
,
args
.
keep_zeros
,
not
args
.
ignore_node_names
,
args
.
colour_list
,
\
args
.
circle_scale
,
args
.
rank
args
.
circle_scale
,
args
.
rank
,
args
.
log_scores
tree_args
=
args
.
use_sourmash
,
args
.
use_rapidnj
,
args
.
use_quicktree
command
,
db_name
,
k
,
n
,
phylogeny
,
alpha
=
runtime_args
directories
,
out_dir
,
truth_dir
=
directory_args
length
,
pile_size
,
error_rate
,
first_n
,
sketch
,
paired_end
=
param_args
plot
,
cutoff
,
cpm
,
taxonomy
=
summary_args
groups
,
plot_phyla
,
keep_zeros
,
use_node_names
,
colour_list
,
circle_scale
,
at_rank
=
plot_args
groups
,
plot_phyla
,
keep_zeros
,
use_node_names
,
colour_list
,
circle_scale
,
at_rank
,
log_scores
=
plot_args
use_sourmash
,
use_rapidnj
,
use_quicktree
=
tree_args
group
=
None
if
groups
is
None
else
groups
[
0
][
0
]
# When referring to sequence groups, not plotting groups.
...
...
@@ -932,7 +934,6 @@ def main():
except
KeyError
:
die
(
"A database name is required!"
)
return
else
:
DB_DIR
=
make_path_absolute
(
str
(
db_name
),
CWD
)
...
...
@@ -1089,7 +1090,8 @@ def main():
colour_list
=
colour_list
,
circle_scale
=
circle_scale
,
paired_end
=
paired_end
,
alpha
=
alpha
alpha
=
alpha
,
log_scores
=
log_scores
)
#
...
...
@@ -1144,7 +1146,8 @@ def main():
phyla
=
plot_phyla
,
name_taxa
=
name_to_lineage
,
colour_list
=
colour_list
,
circle_scale
=
circle_scale
circle_scale
=
circle_scale
,
log_scores
=
log_scores
)
results
.
to_taxonomy
(
name_to_lineage
,
taxon_to_rank
,
tax_results_path
)
...
...
@@ -1187,7 +1190,8 @@ def main():
phyla
=
plot_phyla
,
name_taxa
=
name_to_lineage
,
colour_list
=
colour_list
,
circle_scale
=
circle_scale
circle_scale
=
circle_scale
,
log_scores
=
log_scores
)
results
.
draw_results
()
...
...
src/expam/run.py
View file @
0b66ec7a
...
...
@@ -953,7 +953,6 @@ def main(out_dir, genome_paths, phylogeny_path, k, n=None, n_extract=None,
# Import phylogeny.
genome_ids
,
index
=
import_phylogeny
(
phylogeny_path
)
# Create LCA matrix.
format_name
=
expam
.
sequences
.
format_name
node_to_index
=
{
node
.
name
:
i
for
i
,
node
in
enumerate
(
index
.
pool
)
if
i
>
0
}
lca_matrix
=
make_lca_matrix
(
index
,
node_to_index
)
...
...
@@ -964,14 +963,14 @@ def main(out_dir, genome_paths, phylogeny_path, k, n=None, n_extract=None,
# Multiprocessing configuration.
mp_config
=
{
"name"
:
"expam"
,
# Job system title.
"phases"
:
[
# Two-pass algorithm.
"import"
,
# Import sequences and take union.
"map"
# Map kmer to LCA.
"name"
:
"expam"
,
# Job system title.
"phases"
:
[
# Two-pass algorithm.
"import"
,
# Import sequences and take union.
"map"
# Map kmer to LCA.
],
"layers"
:
{
# Top layer of child processes - extract sequence & kmers.
"layers"
:
{
# Top layer of child processes - extract sequence & kmers.
"class"
:
expam
.
processes
.
ExtractWorker
,
"class_args"
:
{
# Positional arguments to init class.
"class_args"
:
{
# Positional arguments to init class.
"k"
:
k
,
"shm_kmer_params"
:
shm_allocations
,
"kmer_ranges"
:
ranges
,
...
...
@@ -979,23 +978,23 @@ def main(out_dir, genome_paths, phylogeny_path, k, n=None, n_extract=None,
"out_dir"
:
out_dir
,
"logging_dir"
:
logs_dir
,
},
"parent"
:
ControlCenter
,
# Receive work from main process.
"parent"
:
ControlCenter
,
# Receive work from main process.
"child"
:
{
"class"
:
expam
.
processes
.
UnionWorker
,
# Take union of sets of kmers.
"class_args"
:
{
# Each either length n_union or 1.
"class"
:
expam
.
processes
.
UnionWorker
,
# Take union of sets of kmers.
"class_args"
:
{
# Each either length n_union or 1.
"main_con"
:
union_cons
,
"lca_matrix"
:
lca_matrix
,
"pile_size"
:
pile_size
,
"logging_dir"
:
logs_dir
,
"lock_value"
:
lock_value
,
},
"parent"
:
expam
.
processes
.
ExtractWorker
,
# Receive work from ExtractWorkers.
"parent"
:
expam
.
processes
.
ExtractWorker
,
# Receive work from ExtractWorkers.
"child"
:
None
,
# Bottom of process hierarchy.
"n"
:
n_union
,
# Number of processes.
"n"
:
n_union
,
# Number of processes.
},
"n"
:
n_extract
,
},
"timeout"
:
TIMEOUT
,
# Time to block for when checking queues.
"timeout"
:
TIMEOUT
,
# Time to block for when checking queues.
}
process_network
=
ExpamProcesses
.
from_method_dict
(
main_cons
,
out_dir
,
logs_dir
,
mp_config
)
...
...
src/expam/tree.py
View file @
0b66ec7a
import
random
from
math
import
floor
from
math
import
floor
,
log
import
json
import
os
import
traceback
...
...
@@ -768,14 +768,14 @@ class Index:
return
list
(
self
.
yield_leaves
(
node_name
))
def
draw_results
(
self
,
file_path
,
out_dir
,
skiprows
=
None
,
groups
=
None
,
cutoff
=
None
,
cpm
=
None
,
colour_list
=
None
,
name_to_taxon
=
None
,
use_phyla
=
False
,
keep_zeros
=
True
,
use_node_names
=
True
):
name_to_taxon
=
None
,
use_phyla
=
False
,
keep_zeros
=
True
,
use_node_names
=
True
,
log_scores
=
False
):
counts
=
pd
.
read_csv
(
file_path
,
sep
=
'
\t
'
,
index_col
=
0
,
header
=
0
,
skiprows
=
skiprows
)
self
.
draw_tree
(
out_dir
,
counts
=
counts
,
groups
=
groups
,
cutoff
=
cutoff
,
cpm
=
cpm
,
colour_list
=
colour_list
,
name_to_taxon
=
name_to_taxon
,
use_phyla
=
use_phyla
,
keep_zeros
=
keep_zeros
,
use_node_names
=
use_node_names
)
use_node_names
=
use_node_names
,
log_scores
=
log_scores
)
def
draw_tree
(
self
,
out_dir
,
counts
=
None
,
groups
=
None
,
cutoff
=
None
,
cpm
=
None
,
colour_list
=
None
,
name_to_taxon
=
None
,
use_phyla
=
False
,
keep_zeros
=
True
,
use_node_names
=
True
):
use_phyla
=
False
,
keep_zeros
=
True
,
use_node_names
=
True
,
log_scores
=
True
):
try
:
import
ete3.coretype.tree
from
ete3
import
AttrFace
,
faces
,
Tree
,
TreeStyle
,
NodeStyle
,
TextFace
...
...
@@ -787,11 +787,13 @@ class Index:
return
from
expam.sequences
import
format_name
"""
Phylogenetic printing of nodes.
"""
for
node
in
self
.
pool
:
if
node
.
type
==
'Branch'
and
node
.
name
[
0
]
!=
'p'
:
if
node
.
type
==
'Branch'
and
not
node
.
name
.
startswith
(
'p'
)
:
node
.
name
=
'p'
+
node
.
name
"""
...
...
@@ -805,23 +807,42 @@ class Index:
"""
if
groups
is
None
:
groups
=
[(
None
,
(
section
,))
for
section
in
counts
.
columns
]
else
:
# Format group names.
groups
=
[
(
col
,
tuple
(
format_name
(
group_member
,
remove_comp
=
True
)
for
group_member
in
group
)
)
for
col
,
group
in
groups
]
# *** Relies on counts being defined.
if
counts
is
not
None
:
# Combine counts within the group.
for
_
,
group
in
groups
:
if
len
(
group
)
>
1
:
group_name
=
group
[
0
]
# Conglomerate counts within the group.
counts
.
loc
[:,
group_name
]
=
counts
[
list
(
group
)].
sum
(
axis
=
1
)
counts
.
drop
(
labels
=
list
(
groups
[
1
:]),
axis
=
1
,
inplace
=
True
)
sections
=
counts
.
columns
.
tolist
()
nodes
=
counts
.
index
.
tolist
()
# Remove any groups that weren't specified.
all_groups
=
counts
.
columns
.
tolist
()
specified_groups
=
set
(
group
[
0
]
for
_
,
group
in
groups
)
unspecified_groups
=
[
group
for
group
in
all_groups
if
group
not
in
specified_groups
]
counts
.
drop
(
labels
=
unspecified_groups
,
axis
=
1
,
inplace
=
True
)
"""
Employ cutoff.
"""
sections
=
list
(
specified_groups
)
nodes
=
counts
.
index
.
tolist
()
if
cutoff
is
None
and
cpm
is
None
:
nodes_with_counts
=
nodes
...
...
@@ -854,8 +875,15 @@ class Index:
Colour generation.
"""
max_vector
=
tuple
(
counts
.
max
())
min_vector
=
tuple
(
counts
.
min
())
colours
=
[
colour
for
colour
,
_
in
groups
]
colour_generator
=
ColourGenerator
(
colours
,
max_vector
,
colour_list
=
colour_list
)
colour_generator
=
ColourGenerator
(
colours
,
max_vector
,
min_vector
,
colour_list
=
colour_list
,
log_scores
=
log_scores
)
"""
Function to render any given node.
...
...
@@ -881,7 +909,7 @@ class Index:
node_counts
=
tuple
(
counts
.
loc
[
node
.
name
,
:])
if
any
(
node_counts
):
colour
,
_
=
colour_generator
.
generate
(
node_counts
)
colour
=
colour_generator
.
generate
(
node_counts
)
ns
=
NodeStyle
()
ns
[
'bgcolor'
]
=
colour
...
...
@@ -1000,16 +1028,14 @@ class RandomColour:
class
ColourGenerator
:
def
__init__
(
self
,
colours
,
max_vector
,
colour_list
=
Non
e
):
def
__init__
(
self
,
colours
,
max_vector
,
min_vector
=
None
,
colour_list
=
None
,
log_score
=
Fals
e
):
"""
:param colours:
:param max_vector:
:param colour_list:
"""
"""
Ensure we have enough colours for plotting.
"""
# Ensure we have enough colours for plotting.
g
=
RandomColour
()
# Ensure we have enough colours for each group.
...
...
@@ -1032,19 +1058,30 @@ class ColourGenerator:
#
self
.
max_vector
=
max_vector
self
.
min_vector
=
min_vector
self
.
make_log_score
=
log_score
if
self
.
make_log_score
and
self
.
min_vector
is
None
:
raise
AttributeError
(
"Log scores requested but not min vector supplied!"
)
# Calculate normalisation factors.
self
.
_log_norm_factor
=
[
log
(
mx
/
mn
)
for
mx
,
mn
in
zip
(
self
.
max_vector
,
self
.
min_vector
)]
if
self
.
make_log_score
else
None
def
_calculate_colour
(
self
,
score_vector
,
scorer
):
def
generate
(
self
,
score_vector
):
colour
=
HexColour
(
"#FFFFFF"
)
# White background.
scorer
=
self
.
_linear_scores
if
not
self
.
make_log_score
else
self
.
_log_scores
# Artificial scaling factor.
non_zero_indices
=
[
i
for
i
,
score
in
enumerate
(
score_vector
)
if
score
>
0
]
alpha
=
sum
([
score_vector
[
i
]
for
i
in
non_zero_indices
])
/
sum
([
scorer
[
i
]
for
i
in
non_zero_indices
])
for
next_colour
,
opacity
in
scorer
(
score_vector
):
colour
+=
next_colour
.
opaque
(
opacity
)
for
i
,
score
in
enumerate
(
score_vector
):
if
score
>
0
:
colour
+=
self
.
colours
[
i
].
opaque
(
score
/
scorer
[
i
])
return
str
(
colour
)
return
str
(
colour
),
alpha
def
_linear_scores
(
self
,
vector
):
for
i
,
score
in
enumerate
(
vector
):
if
score
>
0
:
yield
self
.
colours
[
i
],
score
/
self
.
max_vector
[
i
]
def
generate
(
self
,
vector
):
return
self
.
_calculate_colour
(
vector
,
self
.
max_vector
)
def
_log_scores
(
self
,
vector
):
for
i
,
score
in
enumerate
(
vector
):
if
score
>
0
:
yield
self
.
colours
[
i
],
log
(
score
/
self
.
min_vector
[
i
])
/
self
.
_log_norm_factor
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment