从大文件中找到出现次数最多的10个数

问题

这是一个经典题,面试的时候经常会遇到。 例如:文件的内容如下

26329
46184
94842
9036
96555
40954
38187
15548
51452
861
51010
8721
13666
69837 

每行一个数字,找到出现次数最多的数。通常的解法是遍历文件一遍,用hashmap维护一个(number => freq), 然后用最小堆找到最大的10个数。

用Trie代替Hashmap

如果内存很小,hashmap放不下怎么办呢?可以使用Trie来代替HashMap。 但是我们这里的Trie在insert一个串的时候会在叶子节点用一个count来计数。 这里的叶子节点特指isEndtrue的TrieNode。 通常Trie用于匹配字符串,会有startsWith方法。但是我们这里不需要这个方法,而需要一个遍历 所有叶子节点的方法traverseLeaf。下面就是我们的Trieclass。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
package ym.parallel;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashMap;
import java.util.Map;

/**
* Created by ym on 11/26/2017.
*/
public class Trie {

private static final Logger logger = LoggerFactory.getLogger(Trie.class);
private TrieNode root;

public Trie() {
root = new TrieNode(null);
}

public void insert(String s) {
int n = s.length();
TrieNode parent = root;
TrieNode child = null;
for (int i = 0; i < n; i++) {
Character c = s.charAt(i);

if (parent.contains(c)) {
child = parent.get(c);
} else {
child = new TrieNode(c);
parent.put(c, child);
child.parent = parent;
}

if (i == n - 1) {
child.isEnd = true;
child.count += 1;
}
parent = child;
}
}

public void traverseLeaf(TrieClient client) {
Deque<TrieNode> deque = new ArrayDeque<>();
deque.push(root);

while (!deque.isEmpty()) {
TrieNode tmp = deque.pop();
if (tmp.isEnd) {
client.visit(tmp);
}

for (Map.Entry<Character, TrieNode> entry : tmp.children.entrySet()) {
deque.push(entry.getValue());
}
}
}

public static class TrieNode {
private HashMap<Character, TrieNode> children = new HashMap<>();
private boolean isEnd = false;
private Character c;
private Integer count = 0;
private TrieNode parent = null;

TrieNode(Character c) {
this.c = c;
}

public void put(Character c, TrieNode node) {
children.putIfAbsent(c, node);
}

public boolean contains(Character c) {
return children.containsKey(c);
}

public TrieNode get(Character c) {
return children.get(c);
}

public Integer getCount() {
return this.count;
}

public String getString() {
StringBuilder s = new StringBuilder();
TrieNode node = this;
while (node.c != null) {
s.insert(0, node.c.toString());
node = node.parent;
}
return s.toString();
}
}

public interface TrieClient {

default void visit(TrieNode node) {
logger.info(node.getString() + ":" + node.count);
}
}

public static void main(String[] args) {
Trie trie = new Trie();
trie.insert("abc");
trie.insert("ab");
trie.insert("ab");
trie.insert("cde");
trie.insert("cde");
trie.traverseLeaf(new TrieClient() {});

}

}

运行main的结果如下:

2017-11-27 10:40:52 INFO  Trie:103    cde:2
2017-11-27 10:40:52 INFO  Trie:103    ab:2
2017-11-27 10:40:52 INFO  Trie:103    abc:1

用Trie实现的Java代码

这个Trie怎么用呢?以下是完整的代码。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
package ym.parallel;

import com.google.common.base.MoreObjects;
import org.junit.Before;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;

/**
* Created by ym on 11/26/2017.
*/
public class SolutionTrie {

private static final Logger logger = LoggerFactory.getLogger(SolutionTrie.class);

private static String path;
private static Trie trie = new Trie();
private static PriorityQueue<IPCounter> heap = new PriorityQueue<>(10);

@Before
public void setUp() {
path = "d:/tmp/log_ip.txt";
}


@Test
public void findTopTen() {

logger.info("compute begin");
try (BufferedReader br = new BufferedReader(new FileReader(new File(path)));) {
String line = null;
while ((line = br.readLine()) != null) {
trie.insert(line);
}

} catch (IOException e) {
e.printStackTrace();
}
logger.info("read complete.");

trie.traverseLeaf(new Trie.TrieClient() {
@Override
public void visit(Trie.TrieNode node) {
IPCounter ipCounter = new IPCounter(node.getString(), node.getCount());
if (heap.size() < 10) {
heap.add(ipCounter);
} else {
if (heap.peek().count < ipCounter.count) {
heap.poll();
heap.add(ipCounter);
}
}
}
});

while (!heap.isEmpty()) {
logger.info(heap.poll().toString());
}
logger.info("compute end");
}


private static class IPCounter implements Comparable {
private String ip;
private Integer count;

public IPCounter(String ip, Integer count) {
this.ip = ip;
this.count = count;
}


@Override
public int compareTo(Object o) {
IPCounter ipCounter = (IPCounter) o;
return this.count.compareTo(ipCounter.count);
}

@Override
public String toString() {
return MoreObjects.toStringHelper(this).add("Ip", ip).add("count", count).toString();
}
}
}


从大文件中找到出现次数最多的10个数
https://threelambda.com/2017/11/27/find-top-n/
作者
Ming Yang
发布于
2017年11月27日
许可协议