Commit 20df4ab3 authored by Sean Solari's avatar Sean Solari
Browse files

Refactored code into isolated modules

parent 165c87f7
......@@ -5,7 +5,7 @@ from setuptools.extension import Extension
from Cython.Build import cythonize
import numpy as np
EXPAM_VERSION = (0, 0, 9)
EXPAM_VERSION = (1, 0, 0)
SOURCE = os.path.dirname(os.path.abspath(__file__))
......@@ -16,21 +16,21 @@ with open(os.path.join(SOURCE, "README.md"), mode="r", encoding="utf-8") as f:
# Extension instances for Cython scripts.
extensions = [
Extension(
"map",
sources=["src/expam/c/map.pyx"],
"expam.ext.kmers._build",
sources=["src/expam/ext/kmers/extract.pyx", "src/expam/ext/kmers/kmers.c", "src/expam/ext/kmers/jellyfish.c"],
include_dirs=[np.get_include()],
extra_compile_args=["-std=c99"],
define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")]
),
Extension(
"extract",
sources=["src/expam/c/extract.pyx", "src/expam/c/kmers.c", "src/expam/c/jellyfish.c"],
"expam.ext.map._build",
sources=["src/expam/ext/map/map.pyx"],
include_dirs=[np.get_include()],
extra_compile_args=["-std=c99"],
define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")]
),
Extension(
"sets",
sources=["src/expam/c/sets.pyx", "src/expam/c/mfil.c"],
"expam.ext.sets._build",
sources=["src/expam/ext/sets/sets.pyx", "src/expam/ext/sets/mfil.c"],
include_dirs=[np.get_include()],
extra_compile_args=["-std=c99"],
define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")]
......@@ -82,7 +82,7 @@ setup(
#
# Cython modules.
#
ext_package="expam.c",
ext_package="expam.ext",
ext_modules=cythonize(extensions, language_level="3"),
#
# Make main callable from console.
......
#!/usr/bin/env python3
def main():
    """Command-line entry point (stub -- implementation not shown in this view)."""
    ...
if __name__ == "__main__":
    ...
\ No newline at end of file
import expam.classification
import expam.processes
import expam.run
import expam.sequences
import expam.stores
import expam.tree
import gzip

# File extensions treated as compressed inputs, checked in order
# (multi-part extensions like '.tar.gz' must precede their suffixes).
COMPRESSION_EXTNS = ['.tar.gz', '.tar', '.gz']

# Fallbacks used when the extension is not listed in COMP_PARSE.
DEFAULT_MODE = "rb"
DEFAULT_OPENER = open

# Per-extension open mode and opener callable.
COMP_PARSE = {
    ".tar.gz": {"mode": "rb", "opener": gzip.open},
    ".gz": {"mode": "rb", "opener": gzip.open}
}
from .main import ExpamOptions, clear_logs, CommandGroup, PlotLogs
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
from collections import namedtuple
import os
# Identification sent with every NCBI Entrez (E-utilities) request.
ENTREZ_EMAIL = "sean.solari@monash.edu"
ENTREZ_TOOL = "expam"
# NOTE(review): API key committed in source control -- consider loading it
# from an environment variable instead.
ENTREZ_API = "8e5edfa1413576bca4f48b4a5520f6295308"
_STEP = 500  # NCBI Database searches at a time.
# Sub-directory names inside a results folder.
PHY_RESULTS = "phy"
TAX_RESULTS = "tax"
RAW_RESULTS = "raw"
TEMP_RESULTS = "temp"
# File names for per-sample classification outputs.
CLASSIFIED_NAME = "classified.tsv"
SPLIT_NAME = "split.tsv"
# Relative paths (under the results folder) assembled from the names above.
PHY_RAW = os.path.join(PHY_RESULTS, RAW_RESULTS)
TAX_RAW = os.path.join(TAX_RESULTS, RAW_RESULTS)
PHY_CLASSIFIED_FILE = os.path.join(PHY_RESULTS, CLASSIFIED_NAME)
PHY_SPLIT_FILE = os.path.join(PHY_RESULTS, SPLIT_NAME)
TAX_CLASSIFIED_FILE = os.path.join(TAX_RESULTS, CLASSIFIED_NAME)
TAX_SPLIT_FILE = os.path.join(TAX_RESULTS, SPLIT_NAME)
# Named bundle of every results-related path for one output directory.
ResultsPathConfig = namedtuple(
    'ResultsPathConfig',
    [
        'phy', 'tax',
        'temp',
        'phy_raw', 'tax_raw',
        'phy_classified', 'phy_split',
        'tax_classified', 'tax_split'
    ]
)
import os
from expam.classify import PHY_CLASSIFIED_FILE, PHY_RAW, PHY_RESULTS, PHY_SPLIT_FILE, TAX_CLASSIFIED_FILE, TAX_RAW, TAX_RESULTS, TAX_SPLIT_FILE, TEMP_RESULTS, ResultsPathConfig
from expam.utils import die
def make_results_config(out_path: str) -> ResultsPathConfig:
    """Return a ResultsPathConfig with every results path rooted at *out_path*.

    Pure path construction -- nothing is created on disk.
    """
    join = os.path.join
    return ResultsPathConfig(
        phy=join(out_path, PHY_RESULTS),
        tax=join(out_path, TAX_RESULTS),
        temp=join(out_path, TEMP_RESULTS),
        phy_raw=join(out_path, PHY_RAW),
        tax_raw=join(out_path, TAX_RAW),
        phy_classified=join(out_path, PHY_CLASSIFIED_FILE),
        phy_split=join(out_path, PHY_SPLIT_FILE),
        tax_classified=join(out_path, TAX_CLASSIFIED_FILE),
        tax_split=join(out_path, TAX_SPLIT_FILE),
    )
def load_results_config(out_path: str, create: bool = False) -> ResultsPathConfig:
    """Resolve (and optionally create) the results tree rooted at *out_path*.

    :param out_path: base directory for results.
    :param create: when True, create the base directory and its sub-tree.
    :return: the validated ResultsPathConfig.

    Exits via die() when the path cannot be created or does not exist.
    """
    proposed_config: ResultsPathConfig = make_results_config(out_path)
    # Make base results path.
    if create:
        if not os.path.exists(out_path):
            try:
                os.mkdir(out_path)
            except OSError:
                # Fix: previously this only printed and fell through to
                # create_results, which then failed with an unhandled OSError.
                die("Failed to make results path %s." % out_path)
        create_results(proposed_config)
    if not validate_results_configuration(proposed_config, check_taxonomy=False):
        die("Results path does not exist!")
    return proposed_config
def create_results(config: ResultsPathConfig):
    """Create the phy/tax results directory skeleton described by *config*.

    Uses os.makedirs so any missing parent (e.g. 'phy' before 'phy/raw') is
    created as needed -- the original os.mkdir relied on the tuple ordering
    below to create parents first.
    """
    for path_field in ('phy', 'tax', 'phy_raw', 'tax_raw'):
        path = getattr(config, path_field)
        if not os.path.exists(path):
            os.makedirs(path)
def validate_results_configuration(config: ResultsPathConfig, check_taxonomy: bool = True):
    """Return True when the results tree described by *config* exists on disk.

    The phylogenetic outputs are always required; the taxonomic outputs are
    checked only when *check_taxonomy* is True.
    """
    required_paths = [config.phy, config.phy_classified, config.phy_split]
    if check_taxonomy:
        required_paths += [config.tax, config.tax_classified, config.tax_split]
    return all(os.path.exists(path) for path in required_paths)
def validate_classification_results(results_dir: str):
    """Check that *results_dir* exists and contains expam classification output.

    Exits via die() when the directory is missing; raises Exception when it
    exists but holds neither a phy classified file nor a phy split file.
    """
    if not os.path.exists(results_dir):
        die("Could not find results %s!" % results_dir)
    config: ResultsPathConfig = make_results_config(results_dir)
    has_phy_output = os.path.exists(config.phy_classified) or os.path.exists(config.phy_split)
    if not has_phy_output:
        raise Exception("Path does not look like expam results folder!")
from expam.process.manager import ControlCenter
class ExpamClassifierProcesses(ControlCenter):
    """ControlCenter subclass configured for the classification phase."""

    @classmethod
    def from_method_dict(cls, logging_dir, config):
        """Build an ExpamClassifierProcesses from a configuration dict.

        Delegates to ControlCenter.from_dict for generic construction, copies
        the resulting state into a new instance of this subclass, then
        registers a single 'classify' phase with no processor/transition.
        """
        base_center = super(ExpamClassifierProcesses, cls).from_dict(logging_dir, config)
        # Constructor keyword -> attribute name on the generic ControlCenter
        # instance holding that value (several are private, underscored).
        # NOTE(review): this mapping mirrors ControlCenter's internals --
        # keep in sync with that class.
        base_arguments = {
            "group_name": "group_name",
            "workers": "workers",
            "child_statuses": "_child_statuses",
            "phases": "phases",
            "phase_queues": "phase_queues",
            "child_queues": "child_queues",
            "children": "_children",
            "processors": "_processors",
            "transitions": "_transitions",
            "timeout": "_timeout",
        }
        base_attributes = {
            attr: getattr(base_center, attr_reference)
            for attr, attr_reference in base_arguments.items()
        }
        # Create child class from this instance.
        control_center = ExpamClassifierProcesses(logging_dir=logging_dir, **base_attributes)
        control_center.set_methods(
            {
                "classify": {
                    "processor": None,
                    "transition": None
                }
            }
        )
        return control_center
import re
import time
import requests
from expam.classify import _STEP, ENTREZ_API, ENTREZ_EMAIL, ENTREZ_TOOL
from expam.database import FileLocationConfig
from expam.database.config import validate_taxonomy_files
from expam.utils import yield_csv
class TaxonomyNCBI:
    """Load, update and persist the taxonomy maps backing an expam database.

    Reads the accession-id, taxid-lineage and taxon-rank CSV files named in
    *file_config*, and can fill in missing taxonomy via NCBI Entrez.
    """

    def __init__(self, file_config: FileLocationConfig) -> None:
        """Store the file configuration; raise OSError if taxonomy files are missing."""
        self.config = file_config
        if not validate_taxonomy_files(file_config):
            raise OSError("Taxonomy files not located!")

    def load_taxonomy_map(self, convert_to_name=True):
        """Return (lineage map, taxon data).

        taxon_data maps scientific name -> (taxid, rank).  The lineage map is
        keyed by tax id, or re-keyed by sequence name when *convert_to_name*
        is True.
        """
        # Create map from scientific name --> (taxid, rank).  Names may
        # themselves contain commas, so everything but the last two CSV
        # fields is rejoined as the name.
        taxon_data = {}
        for data in yield_csv(self.config.taxon_rank):
            taxon_data[",".join(data[0:-2])] = tuple(data[-2:])
        # Create map from tax_id --> lineage (tuple).
        tax_id_to_lineage = {}
        for data in yield_csv(self.config.taxid_lineage):
            tax_id_to_lineage[data[0]] = tuple(data[1:])
        if not convert_to_name:
            return tax_id_to_lineage, taxon_data
        # Create map from name --> lineage (tuple).
        name_to_lineage = {}
        for data in yield_csv(self.config.accession_id):
            name_to_lineage[data[0]] = tax_id_to_lineage[data[2]]
        return name_to_lineage, taxon_data

    def load_sequence_map(self):
        """Return rows of (sequence_id, accession_id, taxon_id)."""
        return list(yield_csv(self.config.accession_id))

    def load_taxid_lineage_map(self):
        """Return rows of (taxon_id, *lineage fields)."""
        return list(yield_csv(self.config.taxid_lineage))

    def load_rank_map(self):
        """Return a map from scientific name -> 'taxid,rank' string."""
        name_to_rank = {}
        for data in yield_csv(self.config.taxon_rank):
            if len(data) > 1:
                name_to_rank[data[0]] = ",".join(data[1:])
        return name_to_rank

    def accession_to_taxonomy(self):
        """
        Map accession IDs to taxonomic labels and persist the updated maps.

        :sequence_ids: List - (sequence_id, accession_id, taxon_id)
        :taxon_ranks: List - (taxon_id, taxonomic ranks)
        :taxa_to_rank: Dict - taxon --> rank
        """
        def tuples_to_disk(lst):
            return "\n".join([",".join(item) for item in lst])

        def dict_to_disk(dct):
            return "\n".join([",".join((key, value)) for key, value in dct.items()])

        sequence_ids = self.load_sequence_map()
        taxon_ranks = self.load_taxid_lineage_map()
        taxa_to_rank = self.load_rank_map()

        # Collect taxon ids for unknown organisms.
        accessions_to_be_mapped = []
        taxa_to_be_collected = []
        for (sequence_id, accession_id, taxon_id) in sequence_ids:
            if taxon_id == "None":
                accessions_to_be_mapped.append(accession_id)
            else:
                taxa_to_be_collected.append(taxon_id)

        # Fix: construct the requestor unconditionally.  It was previously
        # created only inside the accessions branch below, raising NameError
        # at the request_labels call when no accessions needed mapping but
        # lineages still had to be collected.
        requestor = EntrezRequestor()

        if accessions_to_be_mapped:
            accession_to_tax = requestor.request_tax_ids("nuccore", accessions_to_be_mapped)
            print("Received %d response(s) for ESummary TaxID request!"
                  % len(accession_to_tax))
            # Write newly discovered taxon ids back into sequence_ids.
            # NOTE(review): assumes yield_csv yields mutable lists -- confirm.
            for i in range(len(sequence_ids)):
                if sequence_ids[i][1] in accession_to_tax:
                    sequence_ids[i][2] = accession_to_tax[sequence_ids[i][1]]
                    taxa_to_be_collected.append(sequence_ids[i][2])

        # Collect taxonomic lineages only for taxa we don't already have.
        current_taxa = {taxa[0] for taxa in taxon_ranks}
        taxa_to_be_collected = {  # Set so that we collect unique values.
            taxon_id
            for taxon_id in taxa_to_be_collected
            if taxon_id not in current_taxa
        }
        if taxa_to_be_collected:
            taxid_to_taxon, taxon_to_rank = requestor.request_labels("taxonomy", "xml", list(taxa_to_be_collected))
            print("Received %d response(s) for EFetch Taxon request!"
                  % len(taxid_to_taxon))
            taxon_ranks.extend(taxid_to_taxon)
            taxa_to_rank.update(taxon_to_rank)

        # Save updated maps to disk.
        with open(self.config.accession_id, "w") as f:
            f.write(tuples_to_disk(sequence_ids))
        with open(self.config.taxid_lineage, "w") as f:
            f.write(tuples_to_disk(taxon_ranks))
        print("Taxonomic lineages written to %s!" % self.config.taxid_lineage)
        with open(self.config.taxon_rank, "w") as f:
            f.write(dict_to_disk(taxa_to_rank))
        print("Taxonomic ranks written to %s!" % self.config.taxon_rank)
class EntrezRequestor:
    """Thin client for the NCBI Entrez ESummary and EFetch endpoints."""

    def __init__(self, entrez_tool: str = None, entrez_email: str = None, api_key: str = None) -> None:
        """Store request identification, falling back to module-level defaults."""
        self.entrez_tool = ENTREZ_TOOL if entrez_tool is None else entrez_tool
        self.entrez_email = ENTREZ_EMAIL if entrez_email is None else entrez_email
        self.api_key = ENTREZ_API if api_key is None else api_key

    def request_tax_ids(self, db, id_list):
        """Map accession versions in *id_list* to tax ids via ESummary.

        Posts ids in batches of _STEP, parsing the XML response line by line.
        :return: dict of accession version -> tax id (complete records only).
        """
        POST_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
        taxids, upto = {}, 0
        PARAMS = {
            "tool": self.entrez_tool,
            "email": self.entrez_email,
            "api_key": self.api_key,
            "db": db,
        }
        while upto < len(id_list):
            next_requests = id_list[upto:upto + _STEP]
            print("Posting %d UIDs to NCBI Entrez %s."
                  % (len(next_requests), db))
            PARAMS["id"] = ",".join(next_requests)
            esummary_request = requests.post(
                url=POST_URL,
                data=PARAMS
            )
            # Parse TaxonIDs from raw results.
            accn_id = tax_id = None
            for line in esummary_request.text.split("\n"):
                if "<DocSum>" in line:  # Start new record.
                    accn_id = tax_id = None
                elif 'AccessionVersion' in line:
                    accn_id = self.parse_id(line)
                elif 'TaxId' in line:
                    tax_id = self.parse_tax_id(line)
                elif "</DocSum>" in line:  # Only save complete records.
                    if accn_id is not None and tax_id is not None:
                        taxids[accn_id] = tax_id
            upto += _STEP
            time.sleep(1.0)  # Allow server time to breathe.
        return taxids

    @staticmethod
    def parse_id(string):
        """Extract the AccessionVersion value from an ESummary XML line."""
        new_id = re.findall(r'\<Item Name\="AccessionVersion" Type\="String"\>(.*?)\<\/Item\>', string)
        if not new_id:
            # Fix: message previously said "No taxids found!" (copy-paste).
            raise ValueError("No accession ids found!")
        else:
            return new_id[0]

    @staticmethod
    def parse_tax_id(string):
        """Extract the TaxId item value from an ESummary XML line."""
        taxids = re.findall(r'\<Item Name\="TaxId" Type\="Integer"\>(.*?)\<\/Item\>', string)
        if not taxids:
            raise ValueError("No taxids found!")
        else:
            return taxids[0]

    def request_labels(self, db, retmode, id_list):
        """Fetch lineages and ranks for tax ids in *id_list* via EFetch.

        Posts ids in batches of _STEP.  The parser tracks whether it is inside
        a <LineageEx> section (`collect` False) to separate the main taxon's
        fields from its ancestors'.
        :return: (taxon_labels, ranks) where taxon_labels is a list of
            [tax_id, comma-joined lineage] and ranks maps name -> "taxid,rank".
        """
        POST_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
        taxon_labels, ranks, upto = [], {}, 0
        PARAMS = {
            "tool": self.entrez_tool,
            "email": self.entrez_email,
            "api_key": self.api_key,
            "db": db,
            "retmode": retmode,
        }
        while upto < len(id_list):
            next_requests = id_list[upto:upto + _STEP]
            print("Posting %d UIDs to NCBI Entrez %s."
                  % (len(next_requests), db))
            PARAMS["id"] = ",".join(next_requests)
            efetch_request = requests.post(
                url=POST_URL,
                data=PARAMS
            )
            # Parse taxonomic labels from raw results.
            tax_id = taxa = rank = name = None
            sub_tax_id = sub_rank = sub_name = None
            collect = True  # True while outside a <LineageEx> block.
            for line in efetch_request.text.split("\n"):
                if "<Taxon>" in line and collect:  # Start new record.
                    tax_id = taxa = name = rank = None
                elif "<Taxon>" in line and not collect:
                    sub_tax_id = sub_rank = sub_name = None
                elif "<TaxId>" in line and collect:
                    tax_id = self.parse_single_tax_id(line)
                elif "<TaxId>" in line and not collect:
                    sub_tax_id = self.parse_single_tax_id(line)
                elif "<Lineage>" in line and collect:
                    # Commas are stripped so the lineage can live in one CSV field.
                    taxa = self.parse_lineage(line).replace(",", "")
                elif "<Rank>" in line and not collect:
                    if sub_name == "cellular organisms":
                        sub_rank = "top"
                    else:
                        sub_rank = self.parse_rank(line)
                elif "<Rank>" in line and collect:
                    if name == "cellular organisms":
                        rank = "top"
                    else:
                        rank = self.parse_rank(line)
                elif "<ScientificName>" in line and not collect:
                    sub_name = self.parse_name(line)
                elif "<ScientificName>" in line and collect:
                    name = self.parse_name(line)
                elif "<LineageEx>" in line:
                    collect = False
                elif "</Taxon>" in line and not collect:
                    # NOTE(review): assumes every ancestor record carried both a
                    # TaxId and a Rank; a missing field would concatenate None.
                    ranks[sub_name] = sub_tax_id + "," + sub_rank
                elif "</LineageEx>" in line:
                    collect = True
                elif "</Taxon>" in line and collect:
                    if tax_id is not None and taxa is not None:
                        lineage = taxa.strip().split("; ")
                        if name not in lineage and name is not None:
                            lineage += [name]
                        taxon_labels.append([tax_id, ",".join(lineage)])
                        ranks[name] = tax_id + ',' + rank
            upto += _STEP
            time.sleep(1.0)  # Allow server time to breathe.
        return taxon_labels, ranks

    @staticmethod
    def parse_single_tax_id(string):
        """Extract the value of a <TaxId> element."""
        taxids = re.findall(r'\<TaxId\>(.*?)\<\/TaxId\>', string)
        if not taxids:
            raise ValueError("No taxids found!")
        else:
            return taxids[0]

    @staticmethod
    def parse_lineage(string):
        """Extract the value of a <Lineage> element."""
        lineage = re.findall(r'\<Lineage\>(.*?)\<\/Lineage\>', string)
        if not lineage:
            raise ValueError("No lineages found!")
        else:
            return lineage[0]

    @staticmethod
    def parse_rank(string):
        """Extract the value of a <Rank> element."""
        rank = re.findall(r'\<Rank\>(.*?)\<\/Rank\>', string)
        if not rank:
            raise ValueError("No rank found!")
        else:
            return rank[0]

    @staticmethod
    def parse_name(string):
        """Extract the value of a <ScientificName> element, stripping commas."""
        name = re.findall(r'\<ScientificName\>(.*?)\<\/ScientificName\>', string)
        if not name:
            # Fix: message previously said "No rank found!" (copy-paste).
            raise ValueError("No name found!")
        else:
            # Commas are removed so names are safe in the CSV maps.
            name = re.sub(r"[,]", "", name[0])
            return name
This diff is collapsed.
import os
from expam.main import CommandGroup, ExpamOptions, clear_logs
from expam.database import FileLocationConfig
from expam.database.config import JSONConfig, create_database, make_database_config, validate_database_file_configuration
from expam.logger import Timer
from expam.utils import die, ls, make_path_absolute
class BuildCommand(CommandGroup):
commands: set[str] = {
'quickrun', 'default_db', 'create',
'build', 'print', 'add', 'remove', 'set'
}
def __init__(
self, config: FileLocationConfig,
k: int, n: int, s: int, phylogeny_path: str, pile_size: int,
files: list[str], group: str,
first_n: int,
make_plot: bool = False
) -> None:
super().__init__()
self.config: FileLocationConfig = config
self.k = k
self.n = n
self.s = s
self.phylogeny_path = phylogeny_path
self.pile_size = pile_size
self.files = files
self.group = group
self.first_n = first_n