Grouping on any one of multiple columns in Postgres

Question 1

Is it possible to create some sort of chain of grouping in Postgres? Let's say I have the following chart:

CREATE TABLE foo AS
SELECT row_number() OVER () AS id, *
FROM ( VALUES
 ( 'X', 'D', 'G', 'P' ),
 ( 'F', 'D', 'L', 'M' ),
 ( 'X', 'N', 'R', 'S' ),
 ( 'Y', 'I', 'W', NULL ),
 ( 'U', 'Z', 'E', NULL )
) AS f(a,b,c,d);
id | a | b | c | d
------------------
 1 | X | D | G | P
 2 | F | D | L | M
 3 | X | N | R | S
 4 | Y | I | W | 
 5 | U | Z | E |

I want to somehow craft a GROUP BY that yields three groups:

1, 2 and 3 together
- 1 and 2 because of a common D in the b column
- 1 and 3 because of a common X in the a column
4 alone (no common values in any of the columns; nulls shouldn't match)
5 alone (no common values in any of the columns; nulls shouldn't match)

I'm currently using Postgres 9.5, but we'll upgrade eventually to 9.6, so if there's anything in there that'll help me, I'm open to hearing it.

In other words, I'm looking for something like (let's say I used array_agg(DISTINCT a), etc. to keep display simpler):

 ids | as | bs | cs | ds
-----------------------------------------------------------------------
{1, 2, 3} | {'X', 'F'} | {'D', 'N'} | {'G', 'L', 'R'} | {'P', 'M', 'S'}
{4} | {'Y'} | {'I'} | {'W'} | {NULL}
{5} | {'U'} | {'Z'} | {'E'} | {NULL}

(I'm not exactly sure how the nulls would display so don't get too hung up on that; the important point is that they shouldn't match each other.)

When I use GROUP BY CUBE (a, b, c, d), I get way more than three results...ditto GROUP BY ROLLUP and GROUP BY GROUPING SETS.

Is there an elegant way in Postgres? I can imagine how you would do it in Ruby via Active Record (loop through every record, group it with previous grouped sets that match), but I'd like to keep this in Postgres if possible.

Question 2

Another recursive solution that:

first creates the adjacency list of the connected graph of ids,
then finds its transitive closure (this is the recursive part)
and then groups by (once) to find the connected components that each node belongs to
and joins to the table again and group by (again) to gather the values from all nodes of each connected component.

Initial data (copied from Jack Douglas' solution):

begin;
create schema stack;
set search_path=stack;
create table foo as
select *
from (values (1,'X','D','G','P')
 , (2,'F','D','L','M')
 , (3,'X','N','R','S')
 , (4,'Y','I','W',null)
 , (5,'U','Z','E',null) ) AS f(id,a,b,c,d);

The query:

with recursive 
 al (tail, head) as -- adjacency list 
 ( select f.id, g.id 
 from foo as f join foo as g
 on (f.a = g.a or f.b = g.b or f.c = g.c or f.d = g.d) 
 ),
 tc (tail, head) as -- transitive closure
 ( select * from al
 union distinct
 select f.tail, g.head 
 from al as f join tc as g on f.head = g.tail
 ) ,
 cc (head, ids) as -- group once
 ( select head, array_agg(distinct tail order by tail) as ids
 from tc
 group by head
 ) 
select -- group twice
 ids,
 array_agg(distinct a order by a) as a,
 array_agg(distinct b order by b) as b,
 array_agg(distinct c order by c) as c,
 array_agg(distinct d order by d) as d
from
 cc join foo on cc.head = foo.id
group by ids ;

┌─────────┬───────┬───────┬─────────┬─────────┐
│ ids │ a │ b │ c │ d │
├─────────┼───────┼───────┼─────────┼─────────┤
│ {1,2,3} │ {F,X} │ {D,N} │ {G,L,R} │ {M,P,S} │
│ {4} │ {Y} │ {I} │ {W} │ {NULL} │
│ {5} │ {U} │ {Z} │ {E} │ {NULL} │
└─────────┴───────┴───────┴─────────┴─────────┘

Cleanup:

rollback;

Question 3

Assuming you are after a general solution, I don't think there is any non-recursive method of solving your problem. If your real-world problem needs to work with large numbers of rows you may have your work cut out getting a solution that scales well enough.

test schema and data:

begin;
create schema stack;
set search_path=stack;
create table foo as
select *
from (values (1,'X','D','G','P')
 , (2,'F','D','L','M')
 , (3,'X','N','R','S')
 , (4,'Y','I','W',null)
 , (5,'U','Z','E',null) ) AS f(id,a,b,c,d);

solution:

with recursive t(id,a,b,c,d,start,path,cycle) as (
 select *, id, array[id], false from foo
 union all
 select f.*, start, path||f.id, f.id=any(path)
 from foo f join t 
 on f.id<>t.id and
 (f.a=t.a or f.b=t.b or f.c=t.c or f.d=t.d) where not cycle )
select array_agg(f.id order by f.id) ids
 , array_agg(distinct a order by a) a
 , array_agg(distinct b order by b) b
 , array_agg(distinct c order by c) c
 , array_agg(distinct d order by d) d
from foo f join ( select start id, array_agg(id order by id) ids
 from t
 where not cycle group by start) z on z.id=f.id
group by ids::text;

┌─────────┬───────┬───────┬─────────┬─────────┐
│ ids │ a │ b │ c │ d │
├─────────┼───────┼───────┼─────────┼─────────┤
│ {1,2,3} │ {F,X} │ {D,N} │ {G,L,R} │ {M,P,S} │
│ {4} │ {Y} │ {I} │ {W} │ {NULL} │
│ {5} │ {U} │ {Z} │ {E} │ {NULL} │
└─────────┴───────┴───────┴─────────┴─────────┘

cleanup:

rollback;

ypercubeTM ypercubeTM 99.7k13 gold badges217 silver badges306 bronze badges · Answer 1 · 2016-12-10 01:25:56Z

Another recursive solution that:

first creates the adjacency list of the connected graph of ids,
then finds its transitive closure (this is the recursive part)
and then groups by (once) to find the connected components that each node belongs to
and joins to the table again and group by (again) to gather the values from all nodes of each connected component.

Initial data (copied from Jack Douglas' solution):

begin;
create schema stack;
set search_path=stack;
create table foo as
select *
from (values (1,'X','D','G','P')
 , (2,'F','D','L','M')
 , (3,'X','N','R','S')
 , (4,'Y','I','W',null)
 , (5,'U','Z','E',null) ) AS f(id,a,b,c,d);

The query:

with recursive 
 al (tail, head) as -- adjacency list 
 ( select f.id, g.id 
 from foo as f join foo as g
 on (f.a = g.a or f.b = g.b or f.c = g.c or f.d = g.d) 
 ),
 tc (tail, head) as -- transitive closure
 ( select * from al
 union distinct
 select f.tail, g.head 
 from al as f join tc as g on f.head = g.tail
 ) ,
 cc (head, ids) as -- group once
 ( select head, array_agg(distinct tail order by tail) as ids
 from tc
 group by head
 ) 
select -- group twice
 ids,
 array_agg(distinct a order by a) as a,
 array_agg(distinct b order by b) as b,
 array_agg(distinct c order by c) as c,
 array_agg(distinct d order by d) as d
from
 cc join foo on cc.head = foo.id
group by ids ;

┌─────────┬───────┬───────┬─────────┬─────────┐
│ ids │ a │ b │ c │ d │
├─────────┼───────┼───────┼─────────┼─────────┤
│ {1,2,3} │ {F,X} │ {D,N} │ {G,L,R} │ {M,P,S} │
│ {4} │ {Y} │ {I} │ {W} │ {NULL} │
│ {5} │ {U} │ {Z} │ {E} │ {NULL} │
└─────────┴───────┴───────┴─────────┴─────────┘

Cleanup:

rollback;

Jack Douglas Jack Douglas 40.6k16 gold badges106 silver badges179 bronze badges · Answer 2 · 2016-12-09 19:11:26Z

Assuming you are after a general solution, I don't think there is any non-recursive method of solving your problem. If your real-world problem needs to work with large numbers of rows you may have your work cut out getting a solution that scales well enough.

test schema and data:

begin;
create schema stack;
set search_path=stack;
create table foo as
select *
from (values (1,'X','D','G','P')
 , (2,'F','D','L','M')
 , (3,'X','N','R','S')
 , (4,'Y','I','W',null)
 , (5,'U','Z','E',null) ) AS f(id,a,b,c,d);

solution:

with recursive t(id,a,b,c,d,start,path,cycle) as (
 select *, id, array[id], false from foo
 union all
 select f.*, start, path||f.id, f.id=any(path)
 from foo f join t 
 on f.id<>t.id and
 (f.a=t.a or f.b=t.b or f.c=t.c or f.d=t.d) where not cycle )
select array_agg(f.id order by f.id) ids
 , array_agg(distinct a order by a) a
 , array_agg(distinct b order by b) b
 , array_agg(distinct c order by c) c
 , array_agg(distinct d order by d) d
from foo f join ( select start id, array_agg(id order by id) ids
 from t
 where not cycle group by start) z on z.id=f.id
group by ids::text;

┌─────────┬───────┬───────┬─────────┬─────────┐
│ ids │ a │ b │ c │ d │
├─────────┼───────┼───────┼─────────┼─────────┤
│ {1,2,3} │ {F,X} │ {D,N} │ {G,L,R} │ {M,P,S} │
│ {4} │ {Y} │ {I} │ {W} │ {NULL} │
│ {5} │ {U} │ {Z} │ {E} │ {NULL} │
└─────────┴───────┴───────┴─────────┴─────────┘

cleanup:

rollback;

Stack Exchange Network

Grouping on any one of multiple columns in Postgres

2 Answers 2

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Hot Network Questions

Grouping on any one of multiple columns in Postgres

2 Answers 2

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Related

Hot Network Questions