PostgreSQL导入微博数据
Contents
微博日志的数据
使用的PG版本为9.6.0
文件的每一行的内容格式如下:
{"q":"keyword", "result":"statuses:[{
"id":xxxx,
"created_at":"Thu Jul 16 12:55:27 +0800 2015"
}]"}
关于导入JSON数据,可以参考上一徬 PostgresSQL导入json数据.
详细步骤
-- 创建父表
CREATE TABLE status(data jsonb);
-- 按年分区
CREATE TABLE status2016(
CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2016-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2017-01-01' )
) inherits (status);
CREATE TABLE status2015(
CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2015-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2016-01-01' )
) inherits (status);
CREATE TABLE status2014(
CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2014-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2015-01-01' )
) inherits (status);
CREATE TABLE status2013(
CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2013-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2014-01-01' )
) inherits (status);
CREATE TABLE status2012(
CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2012-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2013-01-01' )
) inherits (status);
CREATE TABLE status2011(
CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2011-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2012-01-01' )
) inherits (status);
CREATE TABLE status2010(
CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2010-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2011-01-01' )
) inherits (status);
CREATE TABLE status2009(
CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2009-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2010-01-01' )
) inherits (status);
CREATE TABLE status2008(
CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2008-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2009-01-01' )
) inherits (status);
CREATE TABLE status2007(
CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2007-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2008-01-01' )
) inherits (status);
CREATE TABLE status2006(
CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2006-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2007-01-01' )
) inherits (status);
-- 创建id的唯一索引
CREATE UNIQUE INDEX status_id_uniq ON status ( (data->>'id') );
CREATE UNIQUE INDEX status2016_id_uniq ON status2016 ( (data->>'id') );
CREATE UNIQUE INDEX status2015_id_uniq ON status2015 ( (data->>'id') );
CREATE UNIQUE INDEX status2014_id_uniq ON status2014 ( (data->>'id') );
CREATE UNIQUE INDEX status2013_id_uniq ON status2013 ( (data->>'id') );
CREATE UNIQUE INDEX status2012_id_uniq ON status2012 ( (data->>'id') );
CREATE UNIQUE INDEX status2011_id_uniq ON status2011 ( (data->>'id') );
CREATE UNIQUE INDEX status2010_id_uniq ON status2010 ( (data->>'id') );
CREATE UNIQUE INDEX status2009_id_uniq ON status2009 ( (data->>'id') );
CREATE UNIQUE INDEX status2008_id_uniq ON status2008 ( (data->>'id') );
CREATE UNIQUE INDEX status2007_id_uniq ON status2007 ( (data->>'id') );
CREATE UNIQUE INDEX status2006_id_uniq ON status2006 ( (data->>'id') );
-- 创建分区的触发器
CREATE OR REPLACE FUNCTION status_insert_trigger()
RETURNS TRIGGER AS $$
BEGIN
IF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2016-01-01' AND
date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2017-01-01' ) THEN
INSERT INTO status2016 VALUES (NEW.*) ON CONFLICT DO NOTHING;
ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2015-01-01' AND
date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2016-01-01') THEN
INSERT INTO status2015 VALUES (NEW.*) ON CONFLICT DO NOTHING;
ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2014-01-01' AND
date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2015-01-01') THEN
INSERT INTO status2014 VALUES (NEW.*) ON CONFLICT DO NOTHING;
ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2013-01-01' AND
date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2014-01-01') THEN
INSERT INTO status2013 VALUES (NEW.*) ON CONFLICT DO NOTHING;
ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2012-01-01' AND
date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2013-03-01') THEN
INSERT INTO status2012 VALUES (NEW.*) ON CONFLICT DO NOTHING;
ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2011-01-01' AND
date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2012-01-01') THEN
INSERT INTO status2011 VALUES (NEW.*) ON CONFLICT DO NOTHING;
ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2010-01-01' AND
date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2011-01-01') THEN
INSERT INTO status2010 VALUES (NEW.*) ON CONFLICT DO NOTHING;
ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2009-01-01' AND
date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2010-01-01') THEN
INSERT INTO status2009 VALUES (NEW.*) ON CONFLICT DO NOTHING;
ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2008-01-01' AND
date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2009-01-01') THEN
INSERT INTO status2008 VALUES (NEW.*) ON CONFLICT DO NOTHING;
ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2007-01-01' AND
date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2008-01-01') THEN
INSERT INTO status2007 VALUES (NEW.*) ON CONFLICT DO NOTHING;
ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2006-01-01' AND
date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2007-01-01') THEN
INSERT INTO status2006 VALUES (NEW.*) ON CONFLICT DO NOTHING;
ELSE
RAISE EXCEPTION 'Date out of range. Fix the measurement_insert_trigger() function! %\n', NEW.data;
END IF;
RETURN NULL;
END;
$$
LANGUAGE plpgsql;
-- 设置触发器
CREATE TRIGGER status_insert_trigger
BEFORE INSERT ON status
FOR EACH ROW EXECUTE PROCEDURE status_insert_trigger();
-- 将数组的微博数据,拆分成每一条微博的数据并插入到status表
insert into status select jsonb_array_elements(data->'result'->'statuses') from import_status where jsonb_array_length(data->'result'->'statuses') > 0 ON CONFLICT DO NOTHING;
-- 创建时间字段的索引
CREATE OR REPLACE FUNCTION f_to_imts(text)
RETURNS timestamptz AS
$$SELECT $1::timestamp with time zone $$
LANGUAGE sql IMMUTABLE;
-- CREATE INDEX status_created_at ON status USING btree ( f_to_imts(data->>'created_at') );
-- SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status_created_at on status ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2016_created_at on status2016 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2015_created_at on status2015 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2014_created_at on status2014 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2013_created_at on status2013 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2012_created_at on status2012 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2011_created_at on status2011 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2010_created_at on status2010 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2009_created_at on status2009 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2008_created_at on status2008 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2007_created_at on status2007 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2006_created_at on status2006 ( f_to_imts((data->>'created_at')) );
-- SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status_text on status using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2016_text on status2016 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2015_text on status2015 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2014_text on status2014 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2013_text on status2013 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2012_text on status2012 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2011_text on status2011 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2010_text on status2010 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2009_text on status2009 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2008_text on status2008 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2007_text on status2007 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2006_text on status2006 using gin(to_tsvector('testzhcfg', data->>'text'));
-- text字段的全文索引
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2016_created_at on status2016 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2015_created_at on status2015 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2014_created_at on status2014 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2013_created_at on status2013 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2012_created_at on status2012 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2011_created_at on status2011 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2010_created_at on status2010 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2009_created_at on status2009 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2008_created_at on status2008 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2007_created_at on status2007 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2006_created_at on status2006 ( f_to_imts((data->>'created_at')) );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2016_text on status2016 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2015_text on status2015 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2014_text on status2014 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2013_text on status2013 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2012_text on status2012 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2011_text on status2011 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2010_text on status2010 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2009_text on status2009 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2008_text on status2008 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2007_text on status2007 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2006_text on status2006 using gin(to_tsvector('testzhcfg', data->>'text'));
– user id 索引
CREATE INDEX CONCURRENTLY index_status_2016_user_id ON status2016 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2015_user_id ON status2015 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2014_user_id ON status2014 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2013_user_id ON status2013 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2012_user_id ON status2012 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2011_user_id ON status2011 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2010_user_id ON status2010 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2009_user_id ON status2009 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2008_user_id ON status2008 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2007_user_id ON status2007 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2006_user_id ON status2006 ( ((data->'user'->>'id')::bigint) );
使用`user_id`索引
explain SELECT (data->'user'->>'id')::bigint from status order by (data->'user'->>'id')::bigint asc limit 100;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------
Limit (cost=3.40..24.43 rows=100 width=8)
-> Result (cost=3.40..6920441.97 rows=32905725 width=8)
-> Merge Append (cost=3.40..6262327.47 rows=32905725 width=199)
Sort Key: ((((status.data -> 'user'::text) ->> 'id'::text))::bigint)
-> Sort (cost=0.01..0.02 rows=1 width=32)
Sort Key: ((((status.data -> 'user'::text) ->> 'id'::text))::bigint)
-> Seq Scan on status (cost=0.00..0.00 rows=1 width=32)
-> Index Scan using user_id on status2016 (cost=0.44..4321159.24 rows=29321520 width=190)
-> Index Scan using index_status_2015_user_id on status2015 (cost=0.43..269118.91 rows=1850299 width=207)
-> Index Scan using index_status_2014_user_id on status2014 (cost=0.43..256224.11 rows=1072512 width=317)
-> Index Scan using index_status_2013_user_id on status2013 (cost=0.42..119684.35 rows=546128 width=356)
-> Index Scan using index_status_2012_user_id on status2012 (cost=0.29..28551.76 rows=93031 width=530)
-> Index Scan using index_status_2011_user_id on status2011 (cost=0.29..4556.18 rows=15193 width=513)
-> Index Scan using index_status_2010_user_id on status2010 (cost=0.28..896.41 rows=2942 width=512)
-> Index Scan using index_status_2009_user_id on status2009 (cost=0.14..20.42 rows=19 width=32)
-> Index Scan using index_status_2008_user_id on status2008 (cost=0.15..64.55 rows=1360 width=32)
-> Index Scan using index_status_2007_user_id on status2007 (cost=0.15..64.55 rows=1360 width=32)
-> Index Scan using index_status_2006_user_id on status2006 (cost=0.15..64.55 rows=1360 width=32)
(18 rows)
Time: 36.967 ms
explain SELECT (data->'user'->>'id')::bigint from status order by (data->'user'->>'id') asc limit 100;
QUERY PLAN
-------------------------------------------------------------------------------------------------
Limit (cost=3445800.51..3445800.76 rows=100 width=40)
-> Sort (cost=3445800.51..3528064.83 rows=32905725 width=40)
Sort Key: (((status.data -> 'user'::text) ->> 'id'::text))
-> Result (cost=0.00..2188167.37 rows=32905725 width=40)
-> Append (cost=0.00..1365524.24 rows=32905725 width=199)
-> Seq Scan on status (cost=0.00..0.00 rows=1 width=32)
-> Seq Scan on status2016 (cost=0.00..1183150.20 rows=29321520 width=190)
-> Seq Scan on status2015 (cost=0.00..73767.99 rows=1850299 width=207)
-> Seq Scan on status2014 (cost=0.00..67815.12 rows=1072512 width=317)
-> Seq Scan on status2013 (cost=0.00..31833.28 rows=546128 width=356)
-> Seq Scan on status2012 (cost=0.00..7462.31 rows=93031 width=530)
-> Seq Scan on status2011 (cost=0.00..1189.93 rows=15193 width=513)
-> Seq Scan on status2010 (cost=0.00..231.42 rows=2942 width=512)
-> Seq Scan on status2009 (cost=0.00..3.19 rows=19 width=32)
-> Seq Scan on status2008 (cost=0.00..23.60 rows=1360 width=32)
-> Seq Scan on status2007 (cost=0.00..23.60 rows=1360 width=32)
-> Seq Scan on status2006 (cost=0.00..23.60 rows=1360 width=32)
(17 rows)
Time: 1.610 ms