Usage as Python library - MrTomRod/scoary-2 GitHub Wiki

Python bindings to the pairwise comparisons algorithm, as described in Read (1995), Maddison (2000), and Brynildsrud (2016).

Simple pair picking

from pprint import pprint
from scoary import ScoaryTree, pick_single, print_tree

# Phylogeny as a nested list; it mirrors the ASCII rendering printed below.
tree = [
    ['isolate1', 'isolate2'],
    [['isolate3', 'isolate4'], ['isolate5', 'isolate6']],
]

# Trait a: one boolean per leaf label.
label_to_trait_a = dict(
    isolate1=True,
    isolate2=False,
    isolate3=True,
    isolate4=False,
    isolate5=True,
    isolate6=False,
)

# Trait b takes the same value as trait a on every isolate in this example.
label_to_trait_b = dict(label_to_trait_a)

# In the rendering, each leaf label is prefixed with two digits, one per
# trait (1 = True, 0 = False).
scoary_tree = ScoaryTree.from_list(tree)
print_tree(scoary_tree, label_to_trait_a, label_to_trait_b)
#       /-11_isolate1
#    /-|
#   |   \-00_isolate2
#   |
# --|      /-11_isolate3
#   |   /-|
#   |  |   \-00_isolate4
#    \-|
#      |   /-11_isolate5
#       \-|
#          \-00_isolate6

# Note that pick_single receives the plain nested list, not the ScoaryTree.
pair_summary = pick_single(
    tree,
    label_to_trait_a,
    label_to_trait_b,
    calc_pvals=True,
)
pprint(pair_summary)
# {'best_fisher_p': 0.25,
#  'max_contrasting_pairs': 3,
#  'max_opposing_pairs': 0,
#  'max_supporting_pairs': 3,
#  'worst_pval': 0.25}

Parallel pair picking

This variant compares one trait against many traits (the rows of a DataFrame) in a single call and takes advantage of Numba optimizations.

import pandas as pd
from scoary import pick

# Four-leaf phylogeny as a nested list.
tree = [['isolate1', 'isolate2'], ['isolate3', 'isolate4']]

# e.g. phenotype
label_to_trait_a = dict(
    isolate1=True,
    isolate2=False,
    isolate3=False,
    isolate4=True,
)

# e.g. presence/absence of genes: one row per gene, one column per isolate
isolate_columns = ['isolate1', 'isolate2', 'isolate3', 'isolate4']
gene_rows = [
    [True, True, False, False],  # gene 1
    [True, False, True, False],  # gene 2
    [True, False, False, True],  # ...
    [False, True, True, False],
    [False, True, False, True],
    [False, True, False, True],
    [False, True, False, True],
    [False, True, False, True],
]
trait_b_df = pd.DataFrame(data=gene_rows, columns=isolate_columns)

# pick() returns five arrays, each holding one entry per row of trait_b_df.
pick_output = pick(
    tree=tree,
    label_to_trait_a=label_to_trait_a,
    trait_b_df=trait_b_df,
    calc_pvals=True,
)
max_contr, max_suppo, max_oppos, best, worst = pick_output

# The `=` specifier echoes each variable name next to its value.
print(f'{max_contr=}\n{max_suppo=}\n{max_oppos=}\n{best=}\n{worst=}')
# max_contr=array([1, 2, 2, 2, 2, 2, 2, 2])
# max_suppo=array([1, 1, 2, 0, 1, 1, 1, 1])
# max_oppos=array([1, 1, 0, 2, 1, 1, 1, 1])
# best=array([1. , 1. , 0.5, 0.5, 1. , 1. , 1. , 1. ])
# worst=array([1. , 1. , 0.5, 0.5, 1. , 1. , 1. , 1. ])
⚠️ **GitHub.com Fallback** ⚠️