5. Directional

Since Cluster and Adjacency often led to an underestimated number and an overestimated number of unique UMIs, the Directional method was developed to seek for a more balanced estimation between somewhat of the two extremes. Its deduplication process can coarsely be described as a directed edge-visiting strategy, that is, merging node B by node A if the count of node A is at least two-fold greater than that of node B. The Directional method in the UMI-tools suite is reported to gain the highest accuracy in identifying PCR duplicates¹.

Similar to the Adjacency method, Directional also draws on the information about the count of UMIs. In the example graph, the counts of the 7 unique UMIs are given in node_val_sorted.

node_val_sorted = pd.Series({
    'A': 120,
    'D': 90,
    'E': 50,
    'G': 5,
    'B': 2,
    'C': 2,
    'F': 1,
})

Python

import umiche as uc

ccs = uc.dedup.cluster(
    graph=graph_adj,
    method='deque',
)

dedup_res = directional(
    connected_components=ccs,
    df_umi_uniq_val_cnt=node_val_sorted,
    graph_adj=graph_adj,
    verbose=True,
)
dedup_count = dedup_res['count']
dedup_clusters = dedup_res['clusters']
print("deduplicated count:\n{}".format(dedup_count))
print("deduplicated clusters:\n{}".format(dedup_clusters))
dedup_clusters_dc = decompose(dedup_clusters)
print("deduplicated clusters decomposed:\n{}".format(dedup_clusters_dc))

console

30/07/2024 15:52:33 logger: ======>visited UMI nodes: {'A'}
30/07/2024 15:52:33 logger: {'A'}
30/07/2024 15:52:33 logger: =========>the neighbor: B
30/07/2024 15:52:33 logger: ======>visited UMI nodes: {'A', 'B'}
30/07/2024 15:52:33 logger: {'A', 'B'}
30/07/2024 15:52:33 logger: =========>the neighbor: A
30/07/2024 15:52:33 logger: =========>the neighbor: C
30/07/2024 15:52:33 logger: =========>the neighbor: C
30/07/2024 15:52:33 logger: ======>visited UMI nodes: {'A', 'B', 'C'}
30/07/2024 15:52:33 logger: {'A', 'B', 'C'}
30/07/2024 15:52:33 logger: =========>the neighbor: A
30/07/2024 15:52:33 logger: =========>the neighbor: B
30/07/2024 15:52:33 logger: =========>the neighbor: D
30/07/2024 15:52:33 logger: remaining: {'D', 'F', 'G', 'E'}
30/07/2024 15:52:33 logger: disapproval [['B', 'C'], ['A', 'D']]
30/07/2024 15:52:33 logger: ======>visited UMI nodes: {'D'}
30/07/2024 15:52:33 logger: {'D'}
30/07/2024 15:52:33 logger: =========>the neighbor: A
30/07/2024 15:52:33 logger: =========>the neighbor: E
30/07/2024 15:52:33 logger: =========>the neighbor: F
30/07/2024 15:52:33 logger: ======>visited UMI nodes: {'D', 'F'}
30/07/2024 15:52:33 logger: {'D', 'F'}
30/07/2024 15:52:33 logger: =========>the neighbor: D
30/07/2024 15:52:33 logger: =========>the neighbor: G
30/07/2024 15:52:33 logger: remaining: {'G', 'E'}
30/07/2024 15:52:33 logger: disapproval [['D', 'E'], ['F', 'G']]
30/07/2024 15:52:33 logger: ======>visited UMI nodes: {'E'}
30/07/2024 15:52:33 logger: {'E'}
30/07/2024 15:52:33 logger: =========>the neighbor: D
30/07/2024 15:52:33 logger: =========>the neighbor: G
30/07/2024 15:52:33 logger: ======>visited UMI nodes: {'G', 'E'}
30/07/2024 15:52:33 logger: {'G', 'E'}
30/07/2024 15:52:33 logger: =========>the neighbor: E
30/07/2024 15:52:33 logger: =========>the neighbor: F
30/07/2024 15:52:33 logger: remaining: set()
30/07/2024 15:52:33 logger: disapproval []
deduplicated count:
3
deduplicated clusters:
{'cc_0': {'node_A': ['A', 'B', 'C'], 'node_D': ['D', 'F'], 'node_E': ['G', 'E']}}
deduplicated clusters decomposed:
{0: ['A', 'B', 'C'], 1: ['D', 'F'], 2: ['G', 'E']}

Deduplicated UMI count

There are 3 connected subcomponents (node_A, node_D, and node_E) in connected component cc_0 and therefore, the deduplicated count of UMIs from 7 unique UMIs is 3 at the single locus.

In addition, if we want to explore if merged UMIs are of the same or different origin, we can also output the statistics.

Python

# the same orgin
print(dedup_res['apv'])

# the different origin
print(dedup_res['disapv'])

console

{'cc_0': {'node_A': [['A', 'B'], ['A', 'C']], 'node_D': [['D', 'F']], 'node_E': [['E', 'G']]}}
{'cc_0': {'node_A': [['B', 'C'], ['A', 'D']], 'node_D': [['D', 'E'], ['F', 'G']], 'node_E': []}}

We implemented the Directional method in UMIche.

Python

def directional(
    connected_components,
    df_umi_uniq_val_cnt,
    graph_adj,
):
    cc_sub_cnt = []
    cc_subs = {}
    cc_apvs = {}
    cc_disapvs = {}
    for i, cc in connected_components.items():

        cc_sub, apv_node_nbr, disapv_node_nbr = umi_tools_(
            df_umi_uniq_val_cnt=df_umi_uniq_val_cnt,
            cc=cc,
            graph_adj=graph_adj,
        )
        cc_sub_cnt.append(len(cc_sub))
        cc_subs['cc_' + str(i)] = cc_sub
        cc_apvs['cc_' + str(i)] = apv_node_nbr
        cc_disapvs['cc_' + str(i)] = disapv_node_nbr
    # print(sum(cc_sub_cnt))
    # print(cc_subs)
    # print(cc_apvs)
    # print(cc_disapvs)
    if self.heterogeneity:
        return (
            sum(cc_sub_cnt),
            cc_subs,
            cc_apvs,
            cc_disapvs,
        )
    else:
        return {
            "count": sum(cc_sub_cnt),
            "clusters": cc_subs,
            "apv": cc_apvs,
            "disapv": cc_disapvs,
        }

def directional_search(
    df_umi_uniq_val_cnt,
    cc,
    graph_adj,
):
    cc_node_sorted = df_umi_uniq_val_cnt.loc[df_umi_uniq_val_cnt.index.isin(cc)].sort_values(ascending=False).to_dict()
    ### @@ cc_umi_sorted
    # {'A': 456, 'E': 90, 'D': 72, 'B': 2, 'C': 2, 'F': 1}
    nodes = [*cc_node_sorted.keys()]
    # print(nodes)
    ### @@ cc_sorted
    # ['A', 'E', 'D', 'B', 'C', 'F']
    node_cp = nodes.copy()
    node_set_remaining = set(node_cp)
    ### @@ node_set_remaining
    # {'C', 'F', 'E', 'B', 'D', 'A'}
    cc_sub = {}
    apv_node_nbr = {}
    disapv_node_nbr = {}
    while nodes:
        e = nodes.pop(0)
        if e in node_set_remaining:
            seen, apv, disapv = dfs(
                node=e,
                node_val_sorted=cc_node_sorted,
                node_set_remaining=node_set_remaining,
                graph_adj=graph_adj,
            )
            ### @@ e, seen
            # A {'C', 'D', 'F', 'A', 'B'}
            # E {'E'}
            cc_sub['node_' + str(e)] = list(seen)
            apv_node_nbr['node_' + str(e)] = apv
            disapv_node_nbr['node_' + str(e)] = disapv
            node_set_remaining = node_set_remaining - seen
            self.console.print('remaining: {}'.format(node_set_remaining))
            self.console.print('disapproval {}'.format(disapv))
            ### @@ print('disapproval {}'.format(disapv))
            # disapproval []
            # disapproval [[183, 103]]
            # disapproval [[131, 4], [131, 147]]
            # ...
            # disapproval [[133, 194]]
            # disapproval []
        else:
            continue
    ### @@ disapv_node_nbr
    # {'node_0': []}
    # {'node_36': [[183, 103]]}
    # {'node_29': [[131, 4], [131, 147]], 'node_4': []}
    # {'node_7': []}
    # {'node_28': [[8, 57]]}
    # ...
    # {'node_59': [[133, 194]]}
    # {'node_63': []}
    return cc_sub, apv_node_nbr, disapv_node_nbr

The directional_search function utilises a depth-first search (DFS) algorithm for travesing the UMI graph. It is written this way

Python

def dfs(
        node,
        node_val_sorted,
        node_set_remaining,
        graph_adj,
):
    visited = set()
    approval = []
    disapproval = []
    g = graph_adj
    def search(node):
        visited.add(node)
        self.console.print('======>visited UMI nodes: {}'.format(visited))
        self.console.print(visited)
        for neighbor in g[node]:
            self.console.print('=========>the neighbor: {}'.format(neighbor))
            if neighbor not in visited:
                if neighbor in node_set_remaining:
                    if node_val_sorted[node] >= 2 * node_val_sorted[neighbor] - 1:
                        approval.append([node, neighbor])
                        search(neighbor)
                    else:
                        disapproval.append([node, neighbor])
    search(node)
    ### @@ approval
    # {'cc_0': {'node_A': [['A', 'B'], ['A', 'C'], ['A', 'D'], ['D', 'F']], 'node_E': []}}
    ### @@ disapproval
    # {'cc_0': {'node_A': [['B', 'C'], ['D', 'E']], 'node_E': []}}
    return visited, approval, disapproval

Smith T, Heger A, Sudbery I. UMI-tools: modeling sequencing errors in Unique Molecular Identifiers to improve quantification accuracy. Genome Res. 2017 Mar;27(3):491-499. doi: 10.1101/gr.209601.116. Epub 2017 Jan 18. PMID: 28100584; PMCID: PMC5340976. ↩