Building an MD5 rainbow-table data warehouse for the numbers under 10 billion
Approach
Create multiple tables, following this rule (it saves storage, since the 2-character prefix lives in the table name instead of being stored in each of the 10^10 rows):
Take the first 2 characters of the MD5 as the suffix, naming each table uid_${first 2 md5 chars}.
The table structure is as follows:
CREATE TABLE uid_xx (
uid UInt64,
md5hash String
) ENGINE = MergeTree
PARTITION BY substring(md5hash, 1, 2)
ORDER BY md5hash
SETTINGS index_granularity = 8192
These SQL statements can be generated with a simple program: MD5 hex digits are 0-9 and a-f, so there are 16^2 = 256 tables. Note that because only the trailing 30 characters are stored, PARTITION BY substring(md5hash, 1, 2) keys each table's partitions on characters 3-4 of the full hash, giving 256 partitions per table. A sketch of such a generator follows.
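A minimal sketch of such a generator (the class name is illustrative; it just prints the 256 statements to stdout):

public class CreateTableGenerator {
    public static void main(String[] args) {
        final String hex = "0123456789abcdef";
        // One CREATE TABLE per 2-character hex prefix: 16 * 16 = 256 statements.
        for (char a : hex.toCharArray()) {
            for (char b : hex.toCharArray()) {
                System.out.printf(
                        "CREATE TABLE uid_%c%c (uid UInt64, md5hash String) "
                        + "ENGINE = MergeTree PARTITION BY substring(md5hash, 1, 2) "
                        + "ORDER BY md5hash SETTINGS index_granularity = 8192;%n",
                        a, b);
            }
        }
    }
}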
Generating the data
import java.util.stream.LongStream;
import org.apache.commons.codec.digest.DigestUtils; // Apache Commons Codec

private static void gen() {
    // Hash every number in [0, 10^10) in parallel. The first 2 hex characters
    // select the log file (and later the table); the remaining 30 are stored.
    LongStream.range(0, 10000000000L).parallel().forEach(i -> {
        final String md5 = DigestUtils.md5Hex(Long.toString(i));
        final String logPrefix = md5.substring(0, 2);
        final String data = md5.substring(2);
        // Loggers.logMap: one logger per 2-character prefix (sketched below)
        Loggers.logMap.get(logPrefix).info("{},{}", i, data);
    });
}
The idea here is to write each prefix's output to its own log file; at import time, each file can then be loaded directly into the corresponding table.
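The Loggers helper is not shown in the source; below is a minimal reconstruction using Logback (class layout, logger names, and paths are assumptions). The real setup presumably used a rolling, gzip-compressing appender, since the import script below expects files named app_xx_yyyy-MM-dd.i.log.gz; a plain FileAppender stands in here to keep the sketch short.

import ch.qos.logback.classic.LoggerContext;
import ch.qos.logback.classic.encoder.PatternLayoutEncoder;
import ch.qos.logback.classic.spi.ILoggingEvent;
import ch.qos.logback.core.FileAppender;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashMap;
import java.util.Map;

// Hypothetical reconstruction: one dedicated logger and file per 2-char prefix.
public final class Loggers {
    public static final Map<String, Logger> logMap = new HashMap<>(256);

    static {
        final LoggerContext ctx = (LoggerContext) LoggerFactory.getILoggerFactory();
        final String hex = "0123456789abcdef";
        for (char a : hex.toCharArray()) {
            for (char b : hex.toCharArray()) {
                final String prefix = "" + a + b;

                final PatternLayoutEncoder encoder = new PatternLayoutEncoder();
                encoder.setContext(ctx);
                encoder.setPattern("%msg%n"); // bare "uid,hash-suffix" CSV lines
                encoder.start();

                final FileAppender<ILoggingEvent> appender = new FileAppender<>();
                appender.setContext(ctx);
                appender.setFile("logs/app/app_" + prefix + ".log");
                appender.setEncoder(encoder);
                appender.start();

                final ch.qos.logback.classic.Logger logger = ctx.getLogger("uid." + prefix);
                logger.setAdditive(false); // don't also write through the root logger
                logger.addAppender(appender);
                logMap.put(prefix, logger);
            }
        }
    }

    private Loggers() {}
}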
Importing the data
#!/usr/bin/env bash
# Single-instance guard: lock a file named after the hash of this script's path.
ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
md5value=$(echo "${ABSOLUTE_PATH}" | md5sum | awk '{print $1}')
echo "${md5value}"
exec 9>"/tmp/${md5value}"
if ! flock -n 9 ; then
    echo "$(date) another instance is running" >> "/tmp/${md5value}.console"
    exit 1
fi
cd /home/sphinx/data/uid-md5-data/logs/app/ || exit 1
pwd
# Skip this run if temporary files exist (e.g. a log rollover still in progress).
ss=$(find . -name "*tmp*" | wc -l)
if [ "${ss}" -gt 0 ]; then
    echo "$(date) has tmp" >> "/tmp/${md5value}.console"
    exit 0
fi
shopt -s nullglob # with no .gz files, skip the loop instead of globbing literally
for f in *.gz; do
    # filename format: app_xx_yyyy-MM-dd.i.log.gz; extract the xx prefix
    IFS='_' read -ra ADDR <<< "$f"
    logPrefix="${ADDR[1]}"
    echo "$(date) dealing... ${f}" >> "/tmp/${md5value}.console"
    zcat "${f}" | clickhouse-client --query="insert into uid_${logPrefix} format CSV"
    rm "${f}"
    echo "$(date) ok ${f}" >> "/tmp/${md5value}.console"
done
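The loop above shells out to clickhouse-client; the same load can also go through ClickHouse's HTTP interface (query in the URL, data in the POST body). A hedged Java sketch of that alternative, assuming the server listens on the default HTTP port 8123 and file names follow the app_xx_... pattern (class name and paths are illustrative):

import java.io.IOException;
import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.GZIPInputStream;

// Stream one .gz log file into its table over ClickHouse's HTTP interface.
public class HttpImporter {
    public static void main(String[] args) throws IOException, InterruptedException {
        final Path gz = Path.of(args[0]); // e.g. app_ab_2020-01-01.0.log.gz
        final String prefix = gz.getFileName().toString().split("_")[1];
        final String query = URLEncoder.encode(
                "INSERT INTO uid_" + prefix + " FORMAT CSV", StandardCharsets.UTF_8);

        final HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create("http://localhost:8123/?query=" + query))
                .POST(HttpRequest.BodyPublishers.ofInputStream(() -> {
                    try {
                        // decompress on the fly; the request body is plain CSV lines
                        return new GZIPInputStream(Files.newInputStream(gz));
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }))
                .build();
        final HttpResponse<String> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.statusCode() + " " + response.body());
    }
}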
SQL generator
Replace the contents of the md5String variable with the comma-separated MD5 values you want to look up.
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class SQLGenerator {
    public static void main(String[] args) {
        final String md5String = "98b2ce0bd254a301d9b143997c4c6cbb,d32e19751a401b9d244f3eed7406ea46,920fcbce5ef5cb2fbf184cb8be840c1c,04fab5ba71e47964918a3a597f400c0a,d614a88db2ae0d166cdbb492901111c6,5134eae60c71fc6fcdeb83d6149977df,66e77ea229b4a99467f39e235e58a6f1,ef6770f4f19e61f71f5bd2289185e1c8";
        final StringBuilder stringBuilder = new StringBuilder();
        final String[] uidArr = md5String.split(",");
        final List<String> md5List = Arrays.stream(uidArr)
                .filter(e -> e != null && !"".equalsIgnoreCase(e.trim()))
                .collect(Collectors.toList());
        for (int i = 0; i < md5List.size(); i++) {
            final String md5 = md5List.get(i).toLowerCase();
            // the first 2 chars pick the table; the remaining 30 are the stored value
            final String prefix = md5.substring(0, 2);
            final String sql;
            if (i < md5List.size() - 1) { // compare against the filtered list, not the raw split array
                sql = "select * from uid_" + prefix + " where md5hash = '" + md5.substring(2) + "' union all ";
            } else {
                sql = "select * from uid_" + prefix + " where md5hash = '" + md5.substring(2) + "' format CSV ;";
            }
            stringBuilder.append(sql);
        }
        System.out.println(stringBuilder);
    }
}
Query performance
The eight point lookups (one per table) run as a single UNION ALL query, which is exactly what SQLGenerator prints:
time clickhouse-client --query="select * from uid_98 where md5hash = 'b2ce0bd254a301d9b143997c4c6cbb' union all select * from uid_d3 where md5hash = '2e19751a401b9d244f3eed7406ea46' union all select * from uid_92 where md5hash = '0fcbce5ef5cb2fbf184cb8be840c1c' union all select * from uid_04 where md5hash = 'fab5ba71e47964918a3a597f400c0a' union all select * from uid_d6 where md5hash = '14a88db2ae0d166cdbb492901111c6' union all select * from uid_51 where md5hash = '34eae60c71fc6fcdeb83d6149977df' union all select * from uid_66 where md5hash = 'e77ea229b4a99467f39e235e58a6f1' union all select * from uid_ef where md5hash = '6770f4f19e61f71f5bd2289185e1c8' format CSV ;"
3306012437,"0fcbce5ef5cb2fbf184cb8be840c1c"
2104562961,"2e19751a401b9d244f3eed7406ea46"
5130863928,"fab5ba71e47964918a3a597f400c0a"
5311495303,"14a88db2ae0d166cdbb492901111c6"
5504360955,"34eae60c71fc6fcdeb83d6149977df"
5935463043,"e77ea229b4a99467f39e235e58a6f1"
5238869571,"6770f4f19e61f71f5bd2289185e1c8"
3389957234,"b2ce0bd254a301d9b143997c4c6cbb"
real 0m0.177s
user 0m0.012s
sys 0m0.028s
Disk usage
du -sh ./* | sort -h
4.0K ./system
328G ./default

That is about 35 bytes per row for 10^10 rows, close to the raw 38 bytes (an 8-byte UInt64 plus a 30-character hash suffix): random hex data leaves MergeTree's compression little to work with.