微博日志的数据

使用的PG版本为9.6.0

文件的每一行的内容格式如下:

{"q":"keyword", "result":"statuses:[{
"id":xxxx,
"created_at":"Thu Jul 16 12:55:27 +0800 2015"
}]"}

关于导入JSON数据,可以参考上一徬 PostgresSQL导入json数据.

详细步骤

-- 创建父表
CREATE TABLE status(data jsonb);

-- 按年分区

CREATE TABLE status2016(
	CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2016-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2017-01-01' )
) inherits (status);
CREATE TABLE status2015(
	CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2015-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2016-01-01' )
) inherits (status);
CREATE TABLE status2014(
	CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2014-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2015-01-01' )
) inherits (status);
CREATE TABLE status2013(
	CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2013-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2014-01-01' )
) inherits (status);
CREATE TABLE status2012(
	CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2012-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2013-01-01' )
) inherits (status);
CREATE TABLE status2011(
	CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2011-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2012-01-01' )
) inherits (status);
CREATE TABLE status2010(
	CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2010-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2011-01-01' )
) inherits (status);
CREATE TABLE status2009(
	CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2009-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2010-01-01' )
) inherits (status);
CREATE TABLE status2008(
	CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2008-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2009-01-01' )
) inherits (status);
CREATE TABLE status2007(
	CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2007-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2008-01-01' )
) inherits (status);
CREATE TABLE status2006(
	CHECK ( date((data->>'created_at')::timestamp with time zone) >= DATE '2006-01-01' AND date((data->>'created_at')::timestamp with time zone) < DATE '2007-01-01' )	
) inherits (status);


-- 创建id的唯一索引

CREATE UNIQUE INDEX status_id_uniq  ON status ( (data->>'id') );
CREATE UNIQUE INDEX status2016_id_uniq  ON status2016 ( (data->>'id') );
CREATE UNIQUE INDEX status2015_id_uniq  ON status2015 ( (data->>'id') );
CREATE UNIQUE INDEX status2014_id_uniq  ON status2014 ( (data->>'id') );
CREATE UNIQUE INDEX status2013_id_uniq  ON status2013 ( (data->>'id') );
CREATE UNIQUE INDEX status2012_id_uniq  ON status2012 ( (data->>'id') );
CREATE UNIQUE INDEX status2011_id_uniq  ON status2011 ( (data->>'id') );
CREATE UNIQUE INDEX status2010_id_uniq  ON status2010 ( (data->>'id') );
CREATE UNIQUE INDEX status2009_id_uniq  ON status2009 ( (data->>'id') );
CREATE UNIQUE INDEX status2008_id_uniq  ON status2008 ( (data->>'id') );
CREATE UNIQUE INDEX status2007_id_uniq  ON status2007 ( (data->>'id') );
CREATE UNIQUE INDEX status2006_id_uniq  ON status2006 ( (data->>'id') );


-- 创建分区的触发器
CREATE OR REPLACE FUNCTION status_insert_trigger()
RETURNS TRIGGER AS $$
BEGIN
    IF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2016-01-01' AND
         date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2017-01-01' ) THEN
        INSERT INTO status2016 VALUES (NEW.*)  ON CONFLICT DO NOTHING;
    ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2015-01-01' AND
         date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2016-01-01') THEN
        INSERT INTO status2015 VALUES (NEW.*)  ON CONFLICT DO NOTHING;
    ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2014-01-01' AND
         date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2015-01-01') THEN
        INSERT INTO status2014 VALUES (NEW.*)  ON CONFLICT DO NOTHING;
    ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2013-01-01' AND
         date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2014-01-01') THEN
        INSERT INTO status2013 VALUES (NEW.*)  ON CONFLICT DO NOTHING;
    ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2012-01-01' AND
         date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2013-03-01') THEN
        INSERT INTO status2012 VALUES (NEW.*)  ON CONFLICT DO NOTHING;
    ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2011-01-01' AND
         date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2012-01-01') THEN
        INSERT INTO status2011 VALUES (NEW.*)  ON CONFLICT DO NOTHING;
    ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2010-01-01' AND
         date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2011-01-01') THEN
        INSERT INTO status2010 VALUES (NEW.*)  ON CONFLICT DO NOTHING;
    ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2009-01-01' AND
         date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2010-01-01') THEN
        INSERT INTO status2009 VALUES (NEW.*)  ON CONFLICT DO NOTHING;
    ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2008-01-01' AND
         date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2009-01-01') THEN
        INSERT INTO status2008 VALUES (NEW.*)  ON CONFLICT DO NOTHING;
    ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2007-01-01' AND
         date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2008-01-01') THEN
        INSERT INTO status2007 VALUES (NEW.*)  ON CONFLICT DO NOTHING;
    ELSIF ( date((NEW.data->>'created_at')::timestamp with time zone) >= DATE '2006-01-01' AND
         date((NEW.data->>'created_at')::timestamp with time zone) < DATE '2007-01-01') THEN
        INSERT INTO status2006 VALUES (NEW.*)  ON CONFLICT DO NOTHING;
    ELSE
        RAISE EXCEPTION 'Date out of range.  Fix the measurement_insert_trigger() function! %\n', NEW.data;
    END IF;
    RETURN NULL;
END;
$$
LANGUAGE plpgsql;


-- 设置触发器
CREATE TRIGGER status_insert_trigger
    BEFORE INSERT ON status
    FOR EACH ROW EXECUTE PROCEDURE status_insert_trigger();


-- 将数组的微博数据,拆分成每一条微博的数据并插入到status表
insert into status select jsonb_array_elements(data->'result'->'statuses')   from import_status where jsonb_array_length(data->'result'->'statuses') > 0 ON CONFLICT DO NOTHING;


-- 创建时间字段的索引
CREATE OR REPLACE FUNCTION f_to_imts(text)
  RETURNS timestamptz AS
$$SELECT $1::timestamp with time zone $$
  LANGUAGE sql IMMUTABLE;

-- CREATE INDEX status_created_at ON status USING btree ( f_to_imts(data->>'created_at') );


-- SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status_created_at on status ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2016_created_at on status2016 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2015_created_at on status2015 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2014_created_at on status2014 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2013_created_at on status2013 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2012_created_at on status2012 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2011_created_at on status2011 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2010_created_at on status2010 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2009_created_at on status2009 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2008_created_at on status2008 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2007_created_at on status2007 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2006_created_at on status2006 ( f_to_imts((data->>'created_at'))  );


-- SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status_text on status using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2016_text on status2016 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2015_text on status2015 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2014_text on status2014 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2013_text on status2013 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2012_text on status2012 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2011_text on status2011 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2010_text on status2010 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2009_text on status2009 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2008_text on status2008 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2007_text on status2007 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2006_text on status2006 using gin(to_tsvector('testzhcfg', data->>'text'));


-- text字段的全文索引

SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2016_created_at on status2016 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2015_created_at on status2015 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2014_created_at on status2014 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2013_created_at on status2013 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2012_created_at on status2012 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2011_created_at on status2011 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2010_created_at on status2010 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2009_created_at on status2009 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2008_created_at on status2008 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2007_created_at on status2007 ( f_to_imts((data->>'created_at'))  );
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2006_created_at on status2006 ( f_to_imts((data->>'created_at'))  );



SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2016_text on status2016 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2015_text on status2015 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2014_text on status2014 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2013_text on status2013 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2012_text on status2012 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2011_text on status2011 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2010_text on status2010 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2009_text on status2009 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2008_text on status2008 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2007_text on status2007 using gin(to_tsvector('testzhcfg', data->>'text'));
SET maintenance_work_mem = '1GB'; CREATE INDEX CONCURRENTLY status2006_text on status2006 using gin(to_tsvector('testzhcfg', data->>'text'));

– user id 索引

CREATE INDEX CONCURRENTLY index_status_2016_user_id ON status2016 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2015_user_id ON status2015 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2014_user_id ON status2014 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2013_user_id ON status2013 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2012_user_id ON status2012 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2011_user_id ON status2011 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2010_user_id ON status2010 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2009_user_id ON status2009 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2008_user_id ON status2008 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2007_user_id ON status2007 ( ((data->'user'->>'id')::bigint) );
CREATE INDEX CONCURRENTLY index_status_2006_user_id ON status2006 ( ((data->'user'->>'id')::bigint) );

使用`user_id`索引 


explain SELECT (data->'user'->>'id')::bigint from status order by (data->'user'->>'id')::bigint asc limit 100;
                                                        QUERY PLAN                                                         
---------------------------------------------------------------------------------------------------------------------------
 Limit  (cost=3.40..24.43 rows=100 width=8)
   ->  Result  (cost=3.40..6920441.97 rows=32905725 width=8)
         ->  Merge Append  (cost=3.40..6262327.47 rows=32905725 width=199)
               Sort Key: ((((status.data -> 'user'::text) ->> 'id'::text))::bigint)
               ->  Sort  (cost=0.01..0.02 rows=1 width=32)
                     Sort Key: ((((status.data -> 'user'::text) ->> 'id'::text))::bigint)
                     ->  Seq Scan on status  (cost=0.00..0.00 rows=1 width=32)
               ->  Index Scan using user_id on status2016  (cost=0.44..4321159.24 rows=29321520 width=190)
               ->  Index Scan using index_status_2015_user_id on status2015  (cost=0.43..269118.91 rows=1850299 width=207)
               ->  Index Scan using index_status_2014_user_id on status2014  (cost=0.43..256224.11 rows=1072512 width=317)
               ->  Index Scan using index_status_2013_user_id on status2013  (cost=0.42..119684.35 rows=546128 width=356)
               ->  Index Scan using index_status_2012_user_id on status2012  (cost=0.29..28551.76 rows=93031 width=530)
               ->  Index Scan using index_status_2011_user_id on status2011  (cost=0.29..4556.18 rows=15193 width=513)
               ->  Index Scan using index_status_2010_user_id on status2010  (cost=0.28..896.41 rows=2942 width=512)
               ->  Index Scan using index_status_2009_user_id on status2009  (cost=0.14..20.42 rows=19 width=32)
               ->  Index Scan using index_status_2008_user_id on status2008  (cost=0.15..64.55 rows=1360 width=32)
               ->  Index Scan using index_status_2007_user_id on status2007  (cost=0.15..64.55 rows=1360 width=32)
               ->  Index Scan using index_status_2006_user_id on status2006  (cost=0.15..64.55 rows=1360 width=32)
(18 rows)

Time: 36.967 ms
explain SELECT (data->'user'->>'id')::bigint from status order by (data->'user'->>'id') asc limit 100;
                                           QUERY PLAN                                            
-------------------------------------------------------------------------------------------------
 Limit  (cost=3445800.51..3445800.76 rows=100 width=40)
   ->  Sort  (cost=3445800.51..3528064.83 rows=32905725 width=40)
         Sort Key: (((status.data -> 'user'::text) ->> 'id'::text))
         ->  Result  (cost=0.00..2188167.37 rows=32905725 width=40)
               ->  Append  (cost=0.00..1365524.24 rows=32905725 width=199)
                     ->  Seq Scan on status  (cost=0.00..0.00 rows=1 width=32)
                     ->  Seq Scan on status2016  (cost=0.00..1183150.20 rows=29321520 width=190)
                     ->  Seq Scan on status2015  (cost=0.00..73767.99 rows=1850299 width=207)
                     ->  Seq Scan on status2014  (cost=0.00..67815.12 rows=1072512 width=317)
                     ->  Seq Scan on status2013  (cost=0.00..31833.28 rows=546128 width=356)
                     ->  Seq Scan on status2012  (cost=0.00..7462.31 rows=93031 width=530)
                     ->  Seq Scan on status2011  (cost=0.00..1189.93 rows=15193 width=513)
                     ->  Seq Scan on status2010  (cost=0.00..231.42 rows=2942 width=512)
                     ->  Seq Scan on status2009  (cost=0.00..3.19 rows=19 width=32)
                     ->  Seq Scan on status2008  (cost=0.00..23.60 rows=1360 width=32)
                     ->  Seq Scan on status2007  (cost=0.00..23.60 rows=1360 width=32)
                     ->  Seq Scan on status2006  (cost=0.00..23.60 rows=1360 width=32)
(17 rows)

Time: 1.610 ms