问题
这是一个经典题,面试的时候经常会遇到。
例如:文件的内容如下
26329
46184
94842
9036
96555
40954
38187
15548
51452
861
51010
8721
13666
69837
每行一个数字,找到出现次数最多的数。通常的解法是遍历文件一遍,用hashmap维护一个(number => freq),
然后用最小堆找到最大的10个数。
用Trie代替Hashmap
如果内存很小,hashmap放不下怎么办呢?可以使用Trie来代替HashMap。
但是我们这里的Trie在insert
一个串的时候会在叶子节点用一个count来计数。
这里的叶子节点特指isEnd
为true
的TrieNode。
通常Trie用于匹配字符串,会有startsWith
方法。但是我们这里不需要这个方法,而需要一个遍历
所有叶子节点的方法traverseLeaf
。下面就是我们的Trie
class。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118
| package ym.parallel;
import org.slf4j.Logger; import org.slf4j.LoggerFactory;
import java.util.ArrayDeque; import java.util.Deque; import java.util.HashMap; import java.util.Map;
public class Trie {
private static final Logger logger = LoggerFactory.getLogger(Trie.class); private TrieNode root;
public Trie() { root = new TrieNode(null); }
public void insert(String s) { int n = s.length(); TrieNode parent = root; TrieNode child = null; for (int i = 0; i < n; i++) { Character c = s.charAt(i);
if (parent.contains(c)) { child = parent.get(c); } else { child = new TrieNode(c); parent.put(c, child); child.parent = parent; }
if (i == n - 1) { child.isEnd = true; child.count += 1; } parent = child; } }
public void traverseLeaf(TrieClient client) { Deque<TrieNode> deque = new ArrayDeque<>(); deque.push(root);
while (!deque.isEmpty()) { TrieNode tmp = deque.pop(); if (tmp.isEnd) { client.visit(tmp); }
for (Map.Entry<Character, TrieNode> entry : tmp.children.entrySet()) { deque.push(entry.getValue()); } } }
public static class TrieNode { private HashMap<Character, TrieNode> children = new HashMap<>(); private boolean isEnd = false; private Character c; private Integer count = 0; private TrieNode parent = null;
TrieNode(Character c) { this.c = c; }
public void put(Character c, TrieNode node) { children.putIfAbsent(c, node); }
public boolean contains(Character c) { return children.containsKey(c); }
public TrieNode get(Character c) { return children.get(c); }
public Integer getCount() { return this.count; }
public String getString() { StringBuilder s = new StringBuilder(); TrieNode node = this; while (node.c != null) { s.insert(0, node.c.toString()); node = node.parent; } return s.toString(); } }
public interface TrieClient {
default void visit(TrieNode node) { logger.info(node.getString() + ":" + node.count); } }
public static void main(String[] args) { Trie trie = new Trie(); trie.insert("abc"); trie.insert("ab"); trie.insert("ab"); trie.insert("cde"); trie.insert("cde"); trie.traverseLeaf(new TrieClient() {});
}
}
|
运行main的结果如下:
2017-11-27 10:40:52 INFO Trie:103 cde:2
2017-11-27 10:40:52 INFO Trie:103 ab:2
2017-11-27 10:40:52 INFO Trie:103 abc:1
用Trie实现的Java代码
这个Trie
怎么用呢?以下是完整的代码。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
| package ym.parallel;
import com.google.common.base.MoreObjects; import org.junit.Before; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory;
import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.PriorityQueue;
public class SolutionTrie {
private static final Logger logger = LoggerFactory.getLogger(SolutionTrie.class);
private static String path; private static Trie trie = new Trie(); private static PriorityQueue<IPCounter> heap = new PriorityQueue<>(10);
@Before public void setUp() { path = "d:/tmp/log_ip.txt"; }
@Test public void findTopTen() {
logger.info("compute begin"); try (BufferedReader br = new BufferedReader(new FileReader(new File(path)));) { String line = null; while ((line = br.readLine()) != null) { trie.insert(line); }
} catch (IOException e) { e.printStackTrace(); } logger.info("read complete.");
trie.traverseLeaf(new Trie.TrieClient() { @Override public void visit(Trie.TrieNode node) { IPCounter ipCounter = new IPCounter(node.getString(), node.getCount()); if (heap.size() < 10) { heap.add(ipCounter); } else { if (heap.peek().count < ipCounter.count) { heap.poll(); heap.add(ipCounter); } } } });
while (!heap.isEmpty()) { logger.info(heap.poll().toString()); } logger.info("compute end"); }
private static class IPCounter implements Comparable { private String ip; private Integer count;
public IPCounter(String ip, Integer count) { this.ip = ip; this.count = count; }
@Override public int compareTo(Object o) { IPCounter ipCounter = (IPCounter) o; return this.count.compareTo(ipCounter.count); }
@Override public String toString() { return MoreObjects.toStringHelper(this).add("Ip", ip).add("count", count).toString(); } } }
|