test/whereN.test - external/github.com/sqlite/sqlite.git - Git at Google

 # 2024-04-02
 #
 # The author disclaims copyright to this source code.  In place of
 # a legal notice, here is a blessing:
 #
 #    May you do good and not evil.
 #    May you find forgiveness for yourself and forgive others.
 #    May you share freely, never taking more than you give.
 #
 #***********************************************************************
 # Tests for the whereInterstageHeuristic() routine in the query planner.
 #

 set testdir [file dirname $argv0]
 source $testdir/tester.tcl
 set testprefix whereN

 # The following is a simplified and "sanitized" version of the original
 # real-world query that brought the problem to light.
 #
 # The issue is a slow query.  The answer is correct, but it was taking too
 # much time, because it was doing a full table scan rather than an indexed
 # lookup.
 #
 # The problem was that the query planner was overestimating the number of
 # output rows.  The estimated number of output rows is accurate if the
 # DSNAME parameter is "ds-one".  In that case, a large fraction of the rows
 # in "violation" end up being output.  The query planner correctly deduces
 # that it is faster to do a full table scan of the large "violation" table
 # to avoid the after-query sort that implements the ORDER BY clause. However,
 # if the DSNAME is "ds-two", then only a few rows (about 6) are generated,
 # and it is much much faster to do an indexed lookup of "violation" followed
 # by a sort operation to implement ORDER BY
 #
 # The problem, of course, is that the query planner has no way of knowing
 # in advance how many rows will be generated.  The query planner tries to
 # estimate a worst case, which is a large number of output rows, and it picks
 # the best plan for that case.  However, the plan choosen is very inefficient
 # when the number of output rows is small.
 #
 # The whereInterstageHeuristic() routine in the query planner attempts to
 # correct this by adjusting the query plan such that it avoids the very bad
 # query plan for a small number of rows, at the expense of a slightly less
 # efficient plan for a large number of rows.  The large number of rows case
 # is perhaps 5% slower with the revised plan, but the small number of
 # rows case is around 100 times faster.  That seems like a good tradeoff.
 #
 do_execsql_test 1.0 {
   CREATE TABLE datasource(dsid INT, name TEXT);
   INSERT INTO datasource VALUES(1,'ds-one'),(2,'ds-two'),(3,'ds-three');
   CREATE INDEX ds1 ON datasource(name, dsid);

   CREATE TABLE rule(rid INT, team_id INT, dsid INT);
   WITH RECURSIVE c(n) AS (VALUES(1) UNION ALL SELECT n+1 FROM c WHERE n<9)
     INSERT INTO rule(rid,team_id,dsid) SELECT n, 1, 1 FROM c;
   WITH RECURSIVE c(n) AS (VALUES(10) UNION ALL SELECT n+1 FROM c WHERE n<24)
     INSERT INTO rule(rid,team_id,dsid) SELECT n, 2, 2 FROM c;
   CREATE INDEX rule2 ON rule(dsid, rid);

   CREATE TABLE violation(vid INT, rid INT, vx BLOB);
   /***  Uncomment to insert actual data
   WITH src(rid, cnt) AS (VALUES(1,3586),(2,1343),(3,6505),(5,76230),
                                (6,740),(7,287794),(8,457),(12,1),
                                (14,1),(16,1),(17,1),(18,1),(19,1))
     INSERT INTO violation(vid, rid, vx)
       SELECT rid*1000000+value, rid, randomblob(15)
         FROM src, generate_series(1,cnt);
   ***/
   CREATE INDEX v1 ON violation(rid, vid);
   CREATE INDEX v2 ON violation(vid);
   ANALYZE;
   DELETE FROM sqlite_stat1;
   DROP TABLE IF EXISTS sqlite_stat4;
   INSERT INTO sqlite_stat1 VALUES
     ('violation','v2','376661 1'),
     ('violation','v1','376661 28974 1'),
     ('rule','rule2','24 12 1'),
     ('datasource','ds1','3 1 1');
   ANALYZE sqlite_schema;
 }
 set DSNAME ds-two   ;#  Only a few rows.  Change to "ds-one" for many rows.
 do_eqp_test 1.1 {
   SELECT count(*), length(group_concat(vx)) FROM (
     SELECT V.*
       FROM datasource DS, rule R, violation V
      WHERE V.rid=R.rid
        AND R.dsid=DS.dsid
        AND DS.name=$DSNAME
      ORDER BY V.vid desc
   );
 } {
   QUERY PLAN
   |--CO-ROUTINE (subquery-xxxxxx)
   |  |--SEARCH DS USING COVERING INDEX ds1 (name=?)
   |  |--SEARCH R USING COVERING INDEX rule2 (dsid=?)
   |  |--SEARCH V USING INDEX v1 (rid=?)
   |  `--USE TEMP B-TREE FOR ORDER BY
   `--SCAN (subquery-xxxxxx)
 }
 #       ^^^^---- We want to see three SEARCH terms.  No SCAN terms.
 #                The ORDER BY is implemented by a separate sorter pass.

 finish_test
	# 2024-04-02
	#
	# The author disclaims copyright to this source code. In place of
	# a legal notice, here is a blessing:
	#
	# May you do good and not evil.
	# May you find forgiveness for yourself and forgive others.
	# May you share freely, never taking more than you give.
	#
	#***********************************************************************
	# Tests for the whereInterstageHeuristic() routine in the query planner.
	#

	set testdir [file dirname $argv0]
	source $testdir/tester.tcl
	set testprefix whereN

	# The following is a simplified and "sanitized" version of the original
	# real-world query that brought the problem to light.
	#
	# The issue is a slow query. The answer is correct, but it was taking too
	# much time, because it was doing a full table scan rather than an indexed
	# lookup.
	#
	# The problem was that the query planner was overestimating the number of
	# output rows. The estimated number of output rows is accurate if the
	# DSNAME parameter is "ds-one". In that case, a large fraction of the rows
	# in "violation" end up being output. The query planner correctly deduces
	# that it is faster to do a full table scan of the large "violation" table
	# to avoid the after-query sort that implements the ORDER BY clause. However,
	# if the DSNAME is "ds-two", then only a few rows (about 6) are generated,
	# and it is much much faster to do an indexed lookup of "violation" followed
	# by a sort operation to implement ORDER BY
	#
	# The problem, of course, is that the query planner has no way of knowing
	# in advance how many rows will be generated. The query planner tries to
	# estimate a worst case, which is a large number of output rows, and it picks
	# the best plan for that case. However, the plan choosen is very inefficient
	# when the number of output rows is small.
	#
	# The whereInterstageHeuristic() routine in the query planner attempts to
	# correct this by adjusting the query plan such that it avoids the very bad
	# query plan for a small number of rows, at the expense of a slightly less
	# efficient plan for a large number of rows. The large number of rows case
	# is perhaps 5% slower with the revised plan, but the small number of
	# rows case is around 100 times faster. That seems like a good tradeoff.
	#
	do_execsql_test 1.0 {
	CREATE TABLE datasource(dsid INT, name TEXT);
	INSERT INTO datasource VALUES(1,'ds-one'),(2,'ds-two'),(3,'ds-three');
	CREATE INDEX ds1 ON datasource(name, dsid);

	CREATE TABLE rule(rid INT, team_id INT, dsid INT);
	WITH RECURSIVE c(n) AS (VALUES(1) UNION ALL SELECT n+1 FROM c WHERE n<9)
	INSERT INTO rule(rid,team_id,dsid) SELECT n, 1, 1 FROM c;
	WITH RECURSIVE c(n) AS (VALUES(10) UNION ALL SELECT n+1 FROM c WHERE n<24)
	INSERT INTO rule(rid,team_id,dsid) SELECT n, 2, 2 FROM c;
	CREATE INDEX rule2 ON rule(dsid, rid);

	CREATE TABLE violation(vid INT, rid INT, vx BLOB);
	/*** Uncomment to insert actual data
	WITH src(rid, cnt) AS (VALUES(1,3586),(2,1343),(3,6505),(5,76230),
	(6,740),(7,287794),(8,457),(12,1),
	(14,1),(16,1),(17,1),(18,1),(19,1))
	INSERT INTO violation(vid, rid, vx)
	SELECT rid*1000000+value, rid, randomblob(15)
	FROM src, generate_series(1,cnt);
	***/
	CREATE INDEX v1 ON violation(rid, vid);
	CREATE INDEX v2 ON violation(vid);
	ANALYZE;
	DELETE FROM sqlite_stat1;
	DROP TABLE IF EXISTS sqlite_stat4;
	INSERT INTO sqlite_stat1 VALUES
	('violation','v2','376661 1'),
	('violation','v1','376661 28974 1'),
	('rule','rule2','24 12 1'),
	('datasource','ds1','3 1 1');
	ANALYZE sqlite_schema;
	}
	set DSNAME ds-two ;# Only a few rows. Change to "ds-one" for many rows.
	do_eqp_test 1.1 {
	SELECT count(*), length(group_concat(vx)) FROM (
	SELECT V.*
	FROM datasource DS, rule R, violation V
	WHERE V.rid=R.rid
	AND R.dsid=DS.dsid
	AND DS.name=$DSNAME
	ORDER BY V.vid desc
	);
	} {
	QUERY PLAN
	\|--CO-ROUTINE (subquery-xxxxxx)
	\| \|--SEARCH DS USING COVERING INDEX ds1 (name=?)
	\| \|--SEARCH R USING COVERING INDEX rule2 (dsid=?)
	\| \|--SEARCH V USING INDEX v1 (rid=?)
	\| `--USE TEMP B-TREE FOR ORDER BY
	`--SCAN (subquery-xxxxxx)
	}
	# ^^^^---- We want to see three SEARCH terms. No SCAN terms.
	# The ORDER BY is implemented by a separate sorter pass.

	finish_test