Commit e0779df2 authored by Sean Solari's avatar Sean Solari
Browse files

Bug fixes for cython compilation

parent 20df4ab3
......@@ -15,26 +15,38 @@ with open(os.path.join(SOURCE, "README.md"), mode="r", encoding="utf-8") as f:
# Extension instances for Cython scripts.
extensions = [
Extension(
"expam.ext.kmers._build",
sources=["src/expam/ext/kmers/extract.pyx", "src/expam/ext/kmers/kmers.c", "src/expam/ext/kmers/jellyfish.c"],
include_dirs=[np.get_include()],
extra_compile_args=["-std=c99"],
define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")]
*cythonize(
Extension(
"kmers._build.kmers",
sources=["src/expam/ext/kmers/extract.pyx", "src/expam/ext/kmers/kmers.c", "src/expam/ext/kmers/jellyfish.c"],
include_dirs=[np.get_include()],
extra_compile_args=["-std=c99"],
define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")]
),
language_level="3",
build_dir="src/expam/ext/kmers/_build"
),
Extension(
"expam.ext.map._build",
sources=["src/expam/ext/map/map.pyx"],
include_dirs=[np.get_include()],
define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")]
),
Extension(
"expam.ext.sets._build",
sources=["src/expam/ext/sets/sets.pyx", "src/expam/ext/sets/mfil.c"],
include_dirs=[np.get_include()],
extra_compile_args=["-std=c99"],
define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")]
*cythonize(
Extension(
"map._build.map",
sources=["src/expam/ext/map/map.pyx"],
include_dirs=[np.get_include()],
define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")]
),
language_level="3",
build_dir="src/expam/ext/map/_build"
),
*cythonize(
Extension(
"sets._build.sets",
sources=["src/expam/ext/sets/sets.pyx", "src/expam/ext/sets/mfil.c"],
include_dirs=[np.get_include()],
extra_compile_args=["-std=c99"],
define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")]
),
language_level="3",
build_dir="src/expam/ext/sets/_build"
)
]
setup(
......@@ -83,14 +95,14 @@ setup(
# Cython modules.
#
ext_package="expam.ext",
ext_modules=cythonize(extensions, language_level="3"),
ext_modules=extensions,
#
# Make main callable from console.
#
entry_points={
"console_scripts": [
"expam=expam.cli:main",
"expam_limit=expam.sandbox:main"
"expam=expam.main:main",
"expam_limit=expam.sandbox.main:main"
],
},
#
......
from ._build import \
from ._build.kmers import \
reverse_complement, \
get_raw_kmers, \
bytes_intersect, \
......
This diff is collapsed.
/*
This file is part of Jellyfish.
This work is dual-licensed under 3-Clause BSD License.
Original code from https://github.com/gmarcais/Jellyfish.
Modified on Friday 17th September, 2021.
*/
#include <stdlib.h>
#include <stdint.h>
static uint64_t word_reverse_complement(uint64_t w, int k) {
w = ((w >> 2 ) & 0x3333333333333333) | ((w & 0x3333333333333333) << 2 );
w = ((w >> 4 ) & 0x0F0F0F0F0F0F0F0F) | ((w & 0x0F0F0F0F0F0F0F0F) << 4 );
w = ((w >> 8 ) & 0x00FF00FF00FF00FF) | ((w & 0x00FF00FF00FF00FF) << 8 );
w = ((w >> 16) & 0x0000FFFF0000FFFF) | ((w & 0x0000FFFF0000FFFF) << 16);
w = ( w >> 32 ) | ( w << 32);
w = ~w; // Take reverse complement.
w >>= 2 * (32 - k); // Account for k-values less than 32.
return w;
}
// kmers.c
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
static void shift_kmer(uint64_t *kmer, const unsigned char base, int k) {
uint64_t next_bits, mask;
int chunks, next_chunk;
int i;
chunks = k / 32 + !! (k % 32 != 0);
mask = ~(3ULL);
for (i=0; i < chunks; ++i) {
kmer[i] <<= 2;
next_chunk = (i == chunks - 2) ? k % 32 : 32;
if (i < chunks - 1)
// Hard-coded for 64-bit chunks.
next_bits = kmer[i+1] >> (2*(next_chunk-1));
else {
switch(base) {
case 'A':
next_bits = 0ULL;
break;
case 'C':
next_bits = 1ULL;
break;
case 'G':
next_bits = 2ULL;
break;
case 'T':
next_bits = 3ULL;
break;
default:
next_bits = 0ULL;
break;
}
/* Zero out non-2k bits */
if (k % 32 != 0)
mask &= ~(3ULL << (2*(k % 32)));
}
kmer[i] = (kmer[i] & mask) | next_bits;
}
}
static void update_kmer(uint64_t *kmer, const unsigned char *sequence, uint64_t index, int k) {
uint64_t kmer_value, next_bit;
int chunks, current_chunk;
int i, j;
chunks = k / 32 + !! (k % 32 != 0);
for (i=0; i < chunks; ++i){
kmer_value = 0;
if (i == chunks - 1)
current_chunk = k % 32;
else
current_chunk = 32;
for (j=0; j < current_chunk; ++j) {
switch(sequence[index + i*32 + j]) {
case 'A':
next_bit = 0ULL;
break;
case 'C':
next_bit = 1ULL << (2 * (current_chunk-(j+1)) );
break;
case 'G':
next_bit = 2ULL << (2 * (current_chunk-(j+1)) );
break;
case 'T':
next_bit = 3ULL << (2 * (current_chunk-(j+1)) );
break;
default:
next_bit = 0ULL;
break;
}
kmer_value |= next_bit;
}
kmer[i] = kmer_value;
}
}
static inline uint64_t* init_kmer(const unsigned char *sequence, uint64_t index, int k) {
int chunks;
uint64_t *kmer;
chunks = k / 32 + !!(k % 32 != 0);
kmer = (uint64_t *)malloc(sizeof(uint64_t) * chunks);
update_kmer(kmer, sequence, index, k);
return kmer;
}
static uint64_t find_next_kmer(const unsigned char *sequence, uint64_t index, int k) {
bool can_start;
uint64_t seq_length, max_index;
unsigned char l;
can_start = true; // We are moving onto the next index.
seq_length = max_index = strlen(sequence);
max_index -= k - 1;
while (index <= max_index)
{
for (int i=0; i<k; ++i)
{
l = sequence[index + i];
if ((l!='A') && (l!='C') && (l!='G') && (l!='T'))
{
can_start = false;
break;
}
}
if (can_start)
break;
else {
can_start = true;
++index;
}
}
if (index < max_index)
return index;
return seq_length;
};
static uint64_t raw_kmers(const unsigned char *sequence, int k, uint64_t *arr) {
uint64_t max_index, sequence_length;
uint64_t i, j, x;
uint64_t *kmer;
unsigned char base;
uint64_t chunks;
max_index = sequence_length = strlen(sequence);
max_index -= k-1;
i = x = 0;
chunks = (uint64_t ) (k / 32) + !! (k % 32 != 0);
kmer = (uint64_t *)malloc(chunks * sizeof(uint64_t));
if (!kmer)
return 0;
/* Initialise kmer */
i = find_next_kmer(sequence, i, k);
if (i != sequence_length) {
kmer = init_kmer(sequence, i, k);
for (j=0; j<chunks; j++)
*(arr + (j + 1)) = kmer[j];
*(arr + 0) = i;
++x, ++i;
}
/* Filling loop */
while (i < max_index)
{
/* Find next kmer */
base = sequence[i + k - 1];
if ((base!='A') && (base!='C') && (base!='G') && (base!='T'))
{
i = find_next_kmer(sequence, i, k);
if (i != sequence_length)
update_kmer(kmer, sequence, i, k);
else
// This will quit filling loop.
continue;
}
else
shift_kmer(kmer, base, k);
/* Write this kmer to array and its corresponding index */
for (j=0; j<chunks; ++j)
*(arr + x*(chunks + 1) + (j + 1)) = *(kmer + j);
*(arr + x*(chunks + 1)) = i;
++x, ++i;
}
free(kmer);
return x;
}
static void fill_kmers(const unsigned char *sequence, int k, uint64_t n, uint64_t *arr) {
uint64_t *kmer;
uint64_t i, j, chunks;
i = j = 0;
chunks = k / 32 + !! (k % 32 != 0);
kmer = (uint64_t *)malloc(chunks * sizeof(uint64_t));
if (!kmer)
return;
for (i=0; i<n; ++i) {
update_kmer(kmer, sequence, *(arr + i*(chunks + 1)), k);
for (j=0; j<chunks; ++j)
*(arr + i*(chunks + 1) + (j + 1)) = *(kmer + j);
}
free(kmer);
}
from ._build import \
from ._build.map import \
binarysearch, \
binarysearch_put, \
binarysearch_get, \
......
This diff is collapsed.
from ._build import \
from ._build.sets import \
filler, \
disjoint
\ No newline at end of file
// mfil.c
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
typedef struct {
uint64_t *array;
int64_t index;
} node_t;
typedef struct {
node_t *nodes; // array of nodes.
uint64_t k; // # leaves.
uint64_t chunks; // wrt. node arrays.
uint64_t size; // # nodes, including extra at 0.
} tree_t;
typedef struct {
uint64_t *array;
int64_t index;
} Output;
/* Forward declarations. */
// Array representation of binary tree.
static void init_tree(tree_t *tree, uint64_t k, uint64_t chunks, uint64_t **arr_ptrs, int64_t *indices);
static void set_leaves(tree_t *tree, uint64_t **arr_ptrs, int64_t *indices);
static void fill_tree(tree_t *tree);
static node_t traverse_and_compare(tree_t *tree, int64_t node_index);
static void write_to_node(tree_t *tree, int64_t dest, node_t target);
static node_t get_node(tree_t *tree, int64_t node_index);
static node_t play_game(tree_t *tree, int64_t node_index, node_t one, node_t two);
static bool compare(node_t one, node_t two, uint64_t chunks);
// K-way merging.
static void k_way_merge(uint64_t **arr_ptrs, int64_t *indices, uint64_t n, uint64_t k, uint64_t chunks);
static bool has_more(tree_t *tree);
static void write_to_out(tree_t *tree, Output *out);
static node_t get_next_player(tree_t *tree);
static int64_t get_next_index(tree_t *tree);
static void next_game(tree_t *tree, node_t next_player, int64_t node_index);
// Linear culling.
static int64_t multidisjoint(int n_arrays, int n_columns, uint64_t **arr_ptrs, int64_t *sizes);
/** Array representation of binary tree.
* @param k Number of arrays to be traversed.
* @param arr_ptrs Pointer on each array to be traversed.
* @param indices Length of each array to be traversed.
* @param chunks Breadth of each array.
* @param return Binary tournament loser tree.
*
* Given the tree is indexed using integers, the parent node of any given
* index (i) in the tree is (i/2). There are (2k-1) nodes in the binary tree,
* but account for extra node at top to store winner.
*/
static void init_tree(tree_t *tree, uint64_t k, uint64_t chunks, uint64_t **arr_ptrs, int64_t *indices) {
tree->k = k;
tree->size = 2 * k;
tree->chunks = chunks;
tree->nodes = malloc(sizeof(node_t) * tree->size);
/* Set the leaves for game preparation. */
set_leaves(tree, arr_ptrs, indices);
/* Propagate games upward. */
fill_tree(tree);
}
static void set_leaves(tree_t *tree, uint64_t **arr_ptrs, int64_t *indices) {
for (uint64_t i=tree->k; i<tree->size; ++i) {
/* Pointer on each numpy array/memoryview. */
tree->nodes[i].array = arr_ptrs[i-tree->k];
tree->nodes[i].index = indices[i-tree->k];
}
}
static void fill_tree(tree_t *tree) {
node_t winner;
winner = traverse_and_compare(tree, 1);
write_to_node(tree, 0, winner);
}
static node_t traverse_and_compare(tree_t *tree, int64_t node_index) {
node_t one, two, winner;
if (node_index >= tree->k)
return get_node(tree, node_index);
/* First fill children recursively. */
one = traverse_and_compare(tree, 2 * node_index);
two = traverse_and_compare(tree, (2 * node_index) + 1);
/* Compare my children; write the loser and carry the winner. */
winner = play_game(tree, node_index, one, two);
return winner;
}
static void write_to_node(tree_t *tree, int64_t dest, node_t target) {
tree->nodes[dest].array = target.array;
tree->nodes[dest].index = target.index;
}
static node_t get_node(tree_t *tree, int64_t node_index) {
node_t node;
if (node_index < tree->k)
return tree->nodes[node_index];
else if (tree->nodes[node_index].index < 0) {
node.array = NULL;
node.index = -1;
return node;
} else {
node.array = tree->nodes[node_index].array + (tree->nodes[node_index].index * tree->chunks);
node.index = node_index;
return node;
}
}
static node_t play_game(tree_t *tree, int64_t node_index, node_t one, node_t two) {
/* Write the loser to that node, and return the winner. */
if (one.index < 0) {
write_to_node(tree, node_index, one);
return two;
} else if (two.index < 0) {
write_to_node(tree, node_index, two);
return one;
} else if (compare(one, two, tree->chunks)) {
write_to_node(tree, node_index, two);
return one;
} else {
write_to_node(tree, node_index, one);
return two;
}
}
static bool compare(node_t one, node_t two, uint64_t chunks) {
/* Look for larger kmer to win. */
for (uint64_t x=0; x<chunks; ++x) {
if (one.array[x] > two.array[x])
return true;
else if (two.array[x] > one.array[x])
return false;
else
/* This chunk equal, look to next chunk. */
continue;
}
return false;
}
/** K-way merge (Knuth, 1998)
* @param arr_ptrs Pointer on each array to be merged.
* @param indices Array of lengths of each array.
* @param n Size of final array (i.e. sum(indices)).
* @param k Number of arrays to be merged.
* @param chunks Breadth of each array to be merged.
* @return None Merge done in-place on first array.
*
* Merge k sorted arrays, whose values are distinct with respect to the
* other arrays, into the first array. This first array has been extended
* with 0 values to a size n, but the portion that still contains values is
* stored in the first element of indices.
*/
static void k_way_merge(uint64_t **arr_ptrs, int64_t *indices, uint64_t n, uint64_t k, uint64_t chunks) {
tree_t tree;
node_t next_player;
uint64_t next_index;
Output out;
/* Set output. */
out.array = arr_ptrs[0];
out.index = n - 1;
/* Start competition tree. */
init_tree(&tree, k, chunks, arr_ptrs, indices);
/* Fill arrays. */
while (has_more(&tree)) {
/* Write current winner to output. */
write_to_out(&tree, &out);
/* Start next game. */
next_index = get_next_index(&tree);
next_player = get_next_player(&tree);
next_game(&tree, next_player, next_index);
}
/* Free malloc tree nodes. */
free(tree.nodes);
}
static bool has_more(tree_t *tree) {
if (tree->nodes[0].index < 0)
return false;
return true;
}
static void write_to_out(tree_t *tree, Output *out) {
for (uint64_t j=0; j<tree->chunks; ++j)
*(out->array + (out->index * tree->chunks) + j) = tree->nodes[0].array[j];
out->index--;
}
static node_t get_next_player(tree_t *tree) {
int64_t leaf_index;
leaf_index = tree->nodes[0].index;
tree->nodes[leaf_index].index--;
return get_node(tree, leaf_index);
}
static int64_t get_next_index(tree_t *tree) {
return tree->nodes[0].index / 2;
}
static void next_game(tree_t *tree, node_t next_player, int64_t node_index) {
node_t node;