Commit 0b66ec7a authored by Sean Solari

Grouping specification changes. Added log scale option

parent 2a61fae2
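In summary: plotting groups are specified as (colour, member names) pairs, members of a group are summed into a single column before plotting, and the new --log-scores flag log-scales node colour opacity on the phylotree. A minimal sketch of the grouping specification this diff expects (sample names and colours below are hypothetical):

    # Each entry pairs an optional colour with a tuple of sample names;
    # members of a group are summed into one column before plotting.
    groups = [
        ("#1f77b4", ("sample_A_R1", "sample_A_R2")),
        ("#ff7f0e", ("sample_B",)),
    ]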
......@@ -90,7 +90,7 @@ setup(
entry_points={
"console_scripts": [
"expam=expam.cli:main",
"expamlimit=expam.sandbox:main"
"expam_limit=expam.sandbox:main"
],
},
#
......
import datetime
from os.path import isfile
import os
import re
import requests
import shutil
import time
from ctypes import *
from math import floor
......@@ -93,7 +91,7 @@ def string_to_tuple(st):
def run_classifier(read_paths, out_dir, db_dir, k, n, phylogeny_path, keys_shape, values_shape, logging_dir,
taxonomy=False, cutoff=0.0, groups=None, keep_zeros=False, cpm=0.0, use_node_names=True,
phyla=False, name_taxa=None, colour_list=None, circle_scale=1.0,
paired_end=False, alpha=1.0):
paired_end=False, alpha=1.0, log_scores=False):
# Verify results path.
if not os.path.exists(out_dir):
......@@ -166,7 +164,8 @@ def run_classifier(read_paths, out_dir, db_dir, k, n, phylogeny_path, keys_shape
phyla=phyla,
name_taxa=name_taxa,
colour_list=colour_list,
circle_scale=circle_scale
circle_scale=circle_scale,
log_scores=log_scores
)
if taxonomy:
......@@ -1107,14 +1106,15 @@ class Distribution:
class ClassificationResults:
def __init__(self, index, phylogeny_index, in_dir, out_dir, groups=None, keep_zeros=False, cutoff=0.0, cpm=0.0,
use_node_names=True, phyla=False, name_taxa=None, colour_list=None, circle_scale=1.0):
use_node_names=True, phyla=False, name_taxa=None, colour_list=None, circle_scale=1.0,
log_scores=False):
self.index = index
self.phylogeny_index = phylogeny_index
self.in_dir = in_dir
self.out_dir = out_dir
self.groups = groups # [(colour, (name1, name2, ...)), (colour, (name3, ...)), ...]
self.groups = groups # [(colour, (name1, name2, ...)), ...]
self.keep_zeros = keep_zeros
self.cutoff = cutoff # Cutoff by counts.
......@@ -1125,6 +1125,7 @@ class ClassificationResults:
self.colour_list = colour_list
self.name_taxa = name_taxa
self.circle_scale = circle_scale
self.log_scores = log_scores
if self.phyla and self.name_taxa is None:
raise Exception("Inclusion of phyla requires mapping of names to taxonomic lineages!")
......@@ -1312,7 +1313,8 @@ class ClassificationResults:
name_to_taxon=self.name_taxa,
use_phyla=self.phyla,
keep_zeros=self.keep_zeros,
use_node_names=self.use_node_names
use_node_names=self.use_node_names,
log_scores=self.log_scores
)
# Draw unclassified tree.
......@@ -1326,5 +1328,6 @@ class ClassificationResults:
name_to_taxon=self.name_taxa,
use_phyla=self.phyla,
keep_zeros=self.keep_zeros,
use_node_names=self.use_node_names
use_node_names=self.use_node_names,
log_scores=self.log_scores
)
......@@ -900,8 +900,10 @@ def main():
help="Use QuickTree for Neighbour-Joining algorithm.")
parser.add_argument("--paired", dest="paired_end", default=False, action="store_true",
help="Treat reads as paired-end.")
parser.add_argument("--alpha", dest="alpha", default=None,
parser.add_argument("--alpha", dest="alpha", default=0.1,
help="Percentage requirement for classification subtrees (see Tutorials 1 & 2).")
parser.add_argument("--log-scores", dest="log_scores", default=False, action="store_true",
help="Log transformation to opacity scores on phylotree (think uneven distributions).")
# Parse arguments.
args = parser.parse_args()
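For reference, a self-contained sketch of how the two flags touched above parse on their own (defaults taken from this hunk; the prog name and the rest of the parser are assumed):

    import argparse

    parser = argparse.ArgumentParser(prog="expam")  # prog name assumed
    parser.add_argument("--alpha", dest="alpha", default=0.1,
                        help="Percentage requirement for classification subtrees.")
    parser.add_argument("--log-scores", dest="log_scores", default=False, action="store_true",
                        help="Apply a log transformation to opacity scores on the phylotree.")

    args = parser.parse_args(["--log-scores"])
    print(args.alpha, args.log_scores)  # -> 0.1 True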
......@@ -912,14 +914,14 @@ def main():
param_args = args.length, args.pile, args.error_rate, args.first_n, args.sketch, args.paired_end
summary_args = args.plot, args.cutoff, args.cpm, args.taxonomy
plot_args = args.groups, args.phyla, args.keep_zeros, not args.ignore_node_names, args.colour_list, \
args.circle_scale, args.rank
args.circle_scale, args.rank, args.log_scores
tree_args = args.use_sourmash, args.use_rapidnj, args.use_quicktree
command, db_name, k, n, phylogeny, alpha = runtime_args
directories, out_dir, truth_dir = directory_args
length, pile_size, error_rate, first_n, sketch, paired_end = param_args
plot, cutoff, cpm, taxonomy = summary_args
groups, plot_phyla, keep_zeros, use_node_names, colour_list, circle_scale, at_rank = plot_args
groups, plot_phyla, keep_zeros, use_node_names, colour_list, circle_scale, at_rank, log_scores = plot_args
use_sourmash, use_rapidnj, use_quicktree = tree_args
group = None if groups is None else groups[0][0] # When referring to sequence groups, not plotting groups.
......@@ -932,7 +934,6 @@ def main():
except KeyError:
die("A database name is required!")
return
else:
DB_DIR = make_path_absolute(str(db_name), CWD)
......@@ -1089,7 +1090,8 @@ def main():
colour_list=colour_list,
circle_scale=circle_scale,
paired_end=paired_end,
alpha=alpha
alpha=alpha,
log_scores=log_scores
)
#
......@@ -1144,7 +1146,8 @@ def main():
phyla=plot_phyla,
name_taxa=name_to_lineage,
colour_list=colour_list,
circle_scale=circle_scale
circle_scale=circle_scale,
log_scores=log_scores
)
results.to_taxonomy(name_to_lineage, taxon_to_rank, tax_results_path)
......@@ -1187,7 +1190,8 @@ def main():
phyla=plot_phyla,
name_taxa=name_to_lineage,
colour_list=colour_list,
circle_scale=circle_scale
circle_scale=circle_scale,
log_scores=log_scores
)
results.draw_results()
......
......@@ -953,7 +953,6 @@ def main(out_dir, genome_paths, phylogeny_path, k, n=None, n_extract=None,
# Import phylogeny.
genome_ids, index = import_phylogeny(phylogeny_path)
# Create LCA matrix.
format_name = expam.sequences.format_name
node_to_index = {node.name: i for i, node in enumerate(index.pool) if i > 0}
lca_matrix = make_lca_matrix(index, node_to_index)
......@@ -964,14 +963,14 @@ def main(out_dir, genome_paths, phylogeny_path, k, n=None, n_extract=None,
# Multiprocessing configuration.
mp_config = {
"name": "expam", # Job system title.
"phases": [ # Two-pass algorithm.
"import", # Import sequences and take union.
"map" # Map kmer to LCA.
"name": "expam", # Job system title.
"phases": [ # Two-pass algorithm.
"import", # Import sequences and take union.
"map" # Map kmer to LCA.
],
"layers": { # Top layer of child processes - extract sequence & kmers.
"layers": { # Top layer of child processes - extract sequence & kmers.
"class": expam.processes.ExtractWorker,
"class_args": { # Positional arguments to init class.
"class_args": { # Positional arguments to init class.
"k": k,
"shm_kmer_params": shm_allocations,
"kmer_ranges": ranges,
......@@ -979,23 +978,23 @@ def main(out_dir, genome_paths, phylogeny_path, k, n=None, n_extract=None,
"out_dir": out_dir,
"logging_dir": logs_dir,
},
"parent": ControlCenter, # Receive work from main process.
"parent": ControlCenter, # Receive work from main process.
"child": {
"class": expam.processes.UnionWorker, # Take union of sets of kmers.
"class_args": { # Each either length n_union or 1.
"class": expam.processes.UnionWorker, # Take union of sets of kmers.
"class_args": { # Each either length n_union or 1.
"main_con": union_cons,
"lca_matrix": lca_matrix,
"pile_size": pile_size,
"logging_dir": logs_dir,
"lock_value": lock_value,
},
"parent": expam.processes.ExtractWorker, # Receive work from ExtractWorkers.
"parent": expam.processes.ExtractWorker, # Receive work from ExtractWorkers.
"child": None, # Bottom of process hierarchy.
"n": n_union, # Number of processes.
"n": n_union, # Number of processes.
},
"n": n_extract,
},
"timeout": TIMEOUT, # Time to block for when checking queues.
"timeout": TIMEOUT, # Time to block for when checking queues.
}
process_network = ExpamProcesses.from_method_dict(main_cons, out_dir, logs_dir, mp_config)
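A minimal sketch of how a nested "layers"/"child" spec like the one above implies a total process count, assuming each "n" gives the number of children spawned per parent (count_workers is a hypothetical helper, not part of expam):

    def count_workers(layer, parents=1):
        # Recursively count processes implied by a nested layer spec.
        n = layer.get("n") or 1
        total = parents * n
        child = layer.get("child")
        if child:
            total += count_workers(child, parents * n)
        return total

    # e.g. n_extract=4 ExtractWorkers, each feeding n_union=2 UnionWorkers -> 4 + 8 = 12
    print(count_workers({"n": 4, "child": {"n": 2, "child": None}}))  # 12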
......
import random
from math import floor
from math import floor, log
import json
import os
import traceback
......@@ -768,14 +768,14 @@ class Index:
return list(self.yield_leaves(node_name))
def draw_results(self, file_path, out_dir, skiprows=None, groups=None, cutoff=None, cpm=None, colour_list=None,
name_to_taxon=None, use_phyla=False, keep_zeros=True, use_node_names=True):
name_to_taxon=None, use_phyla=False, keep_zeros=True, use_node_names=True, log_scores=False):
counts = pd.read_csv(file_path, sep='\t', index_col=0, header=0, skiprows=skiprows)
self.draw_tree(out_dir, counts=counts, groups=groups, cutoff=cutoff, cpm=cpm, colour_list=colour_list,
name_to_taxon=name_to_taxon, use_phyla=use_phyla, keep_zeros=keep_zeros,
use_node_names=use_node_names)
use_node_names=use_node_names, log_scores=log_scores)
def draw_tree(self, out_dir, counts=None, groups=None, cutoff=None, cpm=None, colour_list=None, name_to_taxon=None,
use_phyla=False, keep_zeros=True, use_node_names=True):
use_phyla=False, keep_zeros=True, use_node_names=True, log_scores=False):
try:
import ete3.coretype.tree
from ete3 import AttrFace, faces, Tree, TreeStyle, NodeStyle, TextFace
......@@ -787,11 +787,13 @@ class Index:
return
from expam.sequences import format_name
"""
Phylogenetic printing of nodes.
"""
for node in self.pool:
if node.type == 'Branch' and node.name[0] != 'p':
if node.type == 'Branch' and not node.name.startswith('p'):
node.name = 'p' + node.name
"""
......@@ -805,23 +807,42 @@ class Index:
"""
if groups is None:
groups = [(None, (section,)) for section in counts.columns]
else:
# Format group names.
groups = [
(
col,
tuple(
format_name(group_member, remove_comp=True)
for group_member in group
)
)
for col, group in groups
]
# *** Relies on counts being defined.
if counts is not None:
# Combine counts within the group.
for _, group in groups:
if len(group) > 1:
group_name = group[0]
# Conglomerate counts within the group.
counts.loc[:, group_name] = counts[list(group)].sum(axis=1)
counts.drop(labels=list(group[1:]), axis=1, inplace=True)
sections = counts.columns.tolist()
nodes = counts.index.tolist()
# Remove any groups that weren't specified.
all_groups = counts.columns.tolist()
specified_groups = set(group[0] for _, group in groups)
unspecified_groups = [group for group in all_groups if group not in specified_groups]
counts.drop(labels=unspecified_groups, axis=1, inplace=True)
"""
Employ cutoff.
"""
sections = list(specified_groups)
nodes = counts.index.tolist()
if cutoff is None and cpm is None:
nodes_with_counts = nodes
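A standalone illustration of the combine-then-drop behaviour implemented above, using hypothetical sample names and counts:

    import pandas as pd

    counts = pd.DataFrame({"s1": [5, 0], "s2": [3, 2], "other": [1, 1]},
                          index=["p1", "p2"])
    groups = [("#1f77b4", ("s1", "s2"))]  # one plotting group spanning s1 and s2

    for _, group in groups:
        if len(group) > 1:
            group_name = group[0]
            counts.loc[:, group_name] = counts[list(group)].sum(axis=1)
            counts.drop(labels=list(group[1:]), axis=1, inplace=True)

    specified_groups = set(group[0] for _, group in groups)
    counts.drop(labels=[c for c in counts.columns if c not in specified_groups],
                axis=1, inplace=True)
    print(counts)  # a single "s1" column holding 8 and 2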
......@@ -854,8 +875,15 @@ class Index:
Colour generation.
"""
max_vector = tuple(counts.max())
min_vector = tuple(counts.min())
colours = [colour for colour, _ in groups]
colour_generator = ColourGenerator(colours, max_vector, colour_list=colour_list)
colour_generator = ColourGenerator(
colours,
max_vector,
min_vector,
colour_list=colour_list,
log_scores=log_scores
)
"""
Function to render any given node.
......@@ -881,7 +909,7 @@ class Index:
node_counts = tuple(counts.loc[node.name, :])
if any(node_counts):
colour, _ = colour_generator.generate(node_counts)
colour = colour_generator.generate(node_counts)
ns = NodeStyle()
ns['bgcolor'] = colour
......@@ -1000,16 +1028,14 @@ class RandomColour:
class ColourGenerator:
def __init__(self, colours, max_vector, colour_list=None):
def __init__(self, colours, max_vector, min_vector=None, colour_list=None, log_scores=False):
"""
:param colours:
:param max_vector:
:param colour_list:
"""
"""
Ensure we have enough colours for plotting.
"""
# Ensure we have enough colours for plotting.
g = RandomColour()
# Ensure we have enough colours for each group.
......@@ -1032,19 +1058,30 @@ class ColourGenerator:
#
self.max_vector = max_vector
self.min_vector = min_vector
self.make_log_score = log_scores
if self.make_log_score and self.min_vector is None:
raise AttributeError("Log scores requested but not min vector supplied!")
# Calculate normalisation factors.
self._log_norm_factor = [log(mx / mn) for mx, mn in zip(self.max_vector, self.min_vector)] if self.make_log_score else None
def _calculate_colour(self, score_vector, scorer):
def generate(self, score_vector):
colour = HexColour("#FFFFFF") # White background.
scorer = self._linear_scores if not self.make_log_score else self._log_scores
# Artificial scaling factor.
non_zero_indices = [i for i, score in enumerate(score_vector) if score > 0]
alpha = sum([score_vector[i] for i in non_zero_indices]) / sum([scorer[i] for i in non_zero_indices])
for next_colour, opacity in scorer(score_vector):
colour += next_colour.opaque(opacity)
for i, score in enumerate(score_vector):
if score > 0:
colour += self.colours[i].opaque(score / scorer[i])
return str(colour)
return str(colour), alpha
def _linear_scores(self, vector):
for i, score in enumerate(vector):
if score > 0:
yield self.colours[i], score / self.max_vector[i]
def generate(self, vector):
return self._calculate_colour(vector, self.max_vector)
def _log_scores(self, vector):
for i, score in enumerate(vector):
if score > 0:
yield self.colours[i], log(score / self.min_vector[i]) / self._log_norm_factor[i]
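For reference, the linear and log opacity mappings implemented above reduce to the following standalone sketch (max and min counts made up):

    from math import log

    max_count, min_count = 1000.0, 2.0

    def linear_opacity(score):
        return score / max_count

    def log_opacity(score):
        # Normalised so that min_count maps to 0 and max_count maps to 1.
        return log(score / min_count) / log(max_count / min_count)

    for s in (2, 50, 1000):
        print(s, round(linear_opacity(s), 3), round(log_opacity(s), 3))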