5. Directional

Since Cluster and Adjacency often led to an underestimated number and an overestimated number of unique UMIs, the Directional method was developed to seek for a more balanced estimation between somewhat of the two extremes. Its deduplication process can coarsely be described as a directed edge-visiting strategy, that is, merging node B by node A if the count of node A is at least two-fold greater than that of node B. The Directional method in the UMI-tools suite is reported to gain the highest accuracy in identifying PCR duplicates1.

Similar to the Adjacency method, Directional also draws on the information about the count of UMIs. In the example graph, the counts of the 7 unique UMIs are given in node_val_sorted.

1
2
3
4
5
6
7
8
9
node_val_sorted = pd.Series({
    'A': 120,
    'D': 90,
    'E': 50,
    'G': 5,
    'B': 2,
    'C': 2,
    'F': 1,
})

Python

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
import umiche as uc

ccs = uc.dedup.cluster(
    graph=graph_adj,
    method='deque',
)

dedup_res = directional(
    connected_components=ccs,
    df_umi_uniq_val_cnt=node_val_sorted,
    graph_adj=graph_adj,
    verbose=True,
)
dedup_count = dedup_res['count']
dedup_clusters = dedup_res['clusters']
print("deduplicated count:\n{}".format(dedup_count))
print("deduplicated clusters:\n{}".format(dedup_clusters))
dedup_clusters_dc = decompose(dedup_clusters)
print("deduplicated clusters decomposed:\n{}".format(dedup_clusters_dc))

console

30/07/2024 15:52:33 logger: ======>visited UMI nodes: {'A'}
30/07/2024 15:52:33 logger: {'A'}
30/07/2024 15:52:33 logger: =========>the neighbor: B
30/07/2024 15:52:33 logger: ======>visited UMI nodes: {'A', 'B'}
30/07/2024 15:52:33 logger: {'A', 'B'}
30/07/2024 15:52:33 logger: =========>the neighbor: A
30/07/2024 15:52:33 logger: =========>the neighbor: C
30/07/2024 15:52:33 logger: =========>the neighbor: C
30/07/2024 15:52:33 logger: ======>visited UMI nodes: {'A', 'B', 'C'}
30/07/2024 15:52:33 logger: {'A', 'B', 'C'}
30/07/2024 15:52:33 logger: =========>the neighbor: A
30/07/2024 15:52:33 logger: =========>the neighbor: B
30/07/2024 15:52:33 logger: =========>the neighbor: D
30/07/2024 15:52:33 logger: remaining: {'D', 'F', 'G', 'E'}
30/07/2024 15:52:33 logger: disapproval [['B', 'C'], ['A', 'D']]
30/07/2024 15:52:33 logger: ======>visited UMI nodes: {'D'}
30/07/2024 15:52:33 logger: {'D'}
30/07/2024 15:52:33 logger: =========>the neighbor: A
30/07/2024 15:52:33 logger: =========>the neighbor: E
30/07/2024 15:52:33 logger: =========>the neighbor: F
30/07/2024 15:52:33 logger: ======>visited UMI nodes: {'D', 'F'}
30/07/2024 15:52:33 logger: {'D', 'F'}
30/07/2024 15:52:33 logger: =========>the neighbor: D
30/07/2024 15:52:33 logger: =========>the neighbor: G
30/07/2024 15:52:33 logger: remaining: {'G', 'E'}
30/07/2024 15:52:33 logger: disapproval [['D', 'E'], ['F', 'G']]
30/07/2024 15:52:33 logger: ======>visited UMI nodes: {'E'}
30/07/2024 15:52:33 logger: {'E'}
30/07/2024 15:52:33 logger: =========>the neighbor: D
30/07/2024 15:52:33 logger: =========>the neighbor: G
30/07/2024 15:52:33 logger: ======>visited UMI nodes: {'G', 'E'}
30/07/2024 15:52:33 logger: {'G', 'E'}
30/07/2024 15:52:33 logger: =========>the neighbor: E
30/07/2024 15:52:33 logger: =========>the neighbor: F
30/07/2024 15:52:33 logger: remaining: set()
30/07/2024 15:52:33 logger: disapproval []
deduplicated count:
3
deduplicated clusters:
{'cc_0': {'node_A': ['A', 'B', 'C'], 'node_D': ['D', 'F'], 'node_E': ['G', 'E']}}
deduplicated clusters decomposed:
{0: ['A', 'B', 'C'], 1: ['D', 'F'], 2: ['G', 'E']}

Deduplicated UMI count

There are 3 connected subcomponents (node_A, node_D, and node_E) in connected component cc_0 and therefore, the deduplicated count of UMIs from 7 unique UMIs is 3 at the single locus.

In addition, if we want to explore if merged UMIs are of the same or different origin, we can also output the statistics.

Python

1
2
3
4
5
# the same orgin
print(dedup_res['apv'])

# the different origin
print(dedup_res['disapv'])

console

{'cc_0': {'node_A': [['A', 'B'], ['A', 'C']], 'node_D': [['D', 'F']], 'node_E': [['E', 'G']]}}
{'cc_0': {'node_A': [['B', 'C'], ['A', 'D']], 'node_D': [['D', 'E'], ['F', 'G']], 'node_E': []}}

We implemented the Directional method in UMIche.

Python

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def directional(
    connected_components,
    df_umi_uniq_val_cnt,
    graph_adj,
):
    cc_sub_cnt = []
    cc_subs = {}
    cc_apvs = {}
    cc_disapvs = {}
    for i, cc in connected_components.items():

        cc_sub, apv_node_nbr, disapv_node_nbr = umi_tools_(
            df_umi_uniq_val_cnt=df_umi_uniq_val_cnt,
            cc=cc,
            graph_adj=graph_adj,
        )
        cc_sub_cnt.append(len(cc_sub))
        cc_subs['cc_' + str(i)] = cc_sub
        cc_apvs['cc_' + str(i)] = apv_node_nbr
        cc_disapvs['cc_' + str(i)] = disapv_node_nbr
    # print(sum(cc_sub_cnt))
    # print(cc_subs)
    # print(cc_apvs)
    # print(cc_disapvs)
    if self.heterogeneity:
        return (
            sum(cc_sub_cnt),
            cc_subs,
            cc_apvs,
            cc_disapvs,
        )
    else:
        return {
            "count": sum(cc_sub_cnt),
            "clusters": cc_subs,
            "apv": cc_apvs,
            "disapv": cc_disapvs,
        }

def directional_search(
    df_umi_uniq_val_cnt,
    cc,
    graph_adj,
):
    cc_node_sorted = df_umi_uniq_val_cnt.loc[df_umi_uniq_val_cnt.index.isin(cc)].sort_values(ascending=False).to_dict()
    ### @@ cc_umi_sorted
    # {'A': 456, 'E': 90, 'D': 72, 'B': 2, 'C': 2, 'F': 1}
    nodes = [*cc_node_sorted.keys()]
    # print(nodes)
    ### @@ cc_sorted
    # ['A', 'E', 'D', 'B', 'C', 'F']
    node_cp = nodes.copy()
    node_set_remaining = set(node_cp)
    ### @@ node_set_remaining
    # {'C', 'F', 'E', 'B', 'D', 'A'}
    cc_sub = {}
    apv_node_nbr = {}
    disapv_node_nbr = {}
    while nodes:
        e = nodes.pop(0)
        if e in node_set_remaining:
            seen, apv, disapv = dfs(
                node=e,
                node_val_sorted=cc_node_sorted,
                node_set_remaining=node_set_remaining,
                graph_adj=graph_adj,
            )
            ### @@ e, seen
            # A {'C', 'D', 'F', 'A', 'B'}
            # E {'E'}
            cc_sub['node_' + str(e)] = list(seen)
            apv_node_nbr['node_' + str(e)] = apv
            disapv_node_nbr['node_' + str(e)] = disapv
            node_set_remaining = node_set_remaining - seen
            self.console.print('remaining: {}'.format(node_set_remaining))
            self.console.print('disapproval {}'.format(disapv))
            ### @@ print('disapproval {}'.format(disapv))
            # disapproval []
            # disapproval [[183, 103]]
            # disapproval [[131, 4], [131, 147]]
            # ...
            # disapproval [[133, 194]]
            # disapproval []
        else:
            continue
    ### @@ disapv_node_nbr
    # {'node_0': []}
    # {'node_36': [[183, 103]]}
    # {'node_29': [[131, 4], [131, 147]], 'node_4': []}
    # {'node_7': []}
    # {'node_28': [[8, 57]]}
    # ...
    # {'node_59': [[133, 194]]}
    # {'node_63': []}
    return cc_sub, apv_node_nbr, disapv_node_nbr

The directional_search function utilises a depth-first search (DFS) algorithm for travesing the UMI graph. It is written this way

Python

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def dfs(
        node,
        node_val_sorted,
        node_set_remaining,
        graph_adj,
):
    visited = set()
    approval = []
    disapproval = []
    g = graph_adj
    def search(node):
        visited.add(node)
        self.console.print('======>visited UMI nodes: {}'.format(visited))
        self.console.print(visited)
        for neighbor in g[node]:
            self.console.print('=========>the neighbor: {}'.format(neighbor))
            if neighbor not in visited:
                if neighbor in node_set_remaining:
                    if node_val_sorted[node] >= 2 * node_val_sorted[neighbor] - 1:
                        approval.append([node, neighbor])
                        search(neighbor)
                    else:
                        disapproval.append([node, neighbor])
    search(node)
    ### @@ approval
    # {'cc_0': {'node_A': [['A', 'B'], ['A', 'C'], ['A', 'D'], ['D', 'F']], 'node_E': []}}
    ### @@ disapproval
    # {'cc_0': {'node_A': [['B', 'C'], ['D', 'E']], 'node_E': []}}
    return visited, approval, disapproval


  1. Smith T, Heger A, Sudbery I. UMI-tools: modeling sequencing errors in Unique Molecular Identifiers to improve quantification accuracy. Genome Res. 2017 Mar;27(3):491-499. doi: 10.1101/gr.209601.116. Epub 2017 Jan 18. PMID: 28100584; PMCID: PMC5340976.