/* Copyright (c) 2000, 2019, Oracle and/or its affiliates. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
   as published by the Free Software Foundation.

   This program is also distributed with certain software (including
   but not limited to OpenSSL) that is licensed under separate terms,
   as designated in a particular file or component or in included license
   documentation. The authors of MySQL hereby grant you an additional
   permission to link the program and your derivative works with the
   separately licensed software that they have included with MySQL.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License, version 2.0, for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */

/**
  @file

  @brief
  Query execution


  @defgroup Query_Executor Query Executor
  @{
*/

#include "sql/sql_executor.h"

#include <stdint.h>
#include <algorithm>
#include <atomic>
#include <cmath>
#include <cstring>
#include <map>
#include <memory>
#include <new>
#include <string>
#include <utility>
#include <vector>

#include "lex_string.h"
#include "m_ctype.h"
#include "map_helpers.h"
#include "memory_debugging.h"
#include "memroot_allocator.h"
#include "my_alloc.h"
#include "my_bitmap.h"
#include "my_byteorder.h"
#include "my_dbug.h"
#include "my_loglevel.h"
#include "my_macros.h"
#include "my_pointer_arithmetic.h"
#include "my_sqlcommand.h"
#include "my_sys.h"
#include "my_table_map.h"
#include "mysql/components/services/log_builtins.h"
#include "mysql/service_mysql_alloc.h"
#include "mysql_com.h"
#include "mysqld_error.h"
#include "sql/basic_row_iterators.h"
#include "sql/composite_iterators.h"
#include "sql/current_thd.h"
#include "sql/debug_sync.h"  // DEBUG_SYNC
#include "sql/enum_query_type.h"
#include "sql/field.h"
#include "sql/filesort.h"  // Filesort
#include "sql/handler.h"
#include "sql/hash_join_iterator.h"
#include "sql/item_cmpfunc.h"
#include "sql/item_func.h"
#include "sql/item_sum.h"  // Item_sum
#include "sql/json_dom.h"  // Json_wrapper
#include "sql/key.h"       // key_cmp
#include "sql/key_spec.h"
#include "sql/mem_root_array.h"
#include "sql/mysqld.h"  // stage_executing
#include "sql/nested_join.h"
#include "sql/opt_explain_format.h"
#include "sql/opt_range.h"  // QUICK_SELECT_I
#include "sql/opt_trace.h"  // Opt_trace_object
#include "sql/opt_trace_context.h"
#include "sql/parse_tree_nodes.h"  // PT_frame
#include "sql/pfs_batch_mode.h"
#include "sql/protocol.h"
#include "sql/psi_memory_key.h"
#include "sql/query_options.h"
#include "sql/query_result.h"   // Query_result
#include "sql/record_buffer.h"  // Record_buffer
#include "sql/ref_row_iterators.h"
#include "sql/row_iterator.h"
#include "sql/sorting_iterator.h"
#include "sql/sql_base.h"  // fill_record
#include "sql/sql_bitmap.h"
#include "sql/sql_error.h"
#include "sql/sql_join_buffer.h"  // CACHE_FIELD
#include "sql/sql_list.h"
#include "sql/sql_optimizer.h"  // JOIN
#include "sql/sql_select.h"
#include "sql/sql_tmp_table.h"  // create_tmp_table
#include "sql/system_variables.h"
#include "sql/table_function.h"
#include "sql/temp_table_param.h"  // Memroot_vector
#include "sql/thr_malloc.h"
#include "sql/timing_iterator.h"
#include "sql/window.h"
#include "sql/window_lex.h"
#include "sql_string.h"
#include "template_utils.h"
#include "thr_lock.h"

using std::max;
using std::min;
using std::string;
using std::unique_ptr;
using std::vector;

static void return_zero_rows(JOIN *join, List<Item> &fields);
static int do_select(JOIN *join);

static enum_nested_loop_state evaluate_join_record(JOIN *join,
                                                   QEP_TAB *qep_tab);
static enum_nested_loop_state evaluate_null_complemented_join_record(
    JOIN *join, QEP_TAB *qep_tab);
static enum_nested_loop_state end_send(JOIN *join, QEP_TAB *qep_tab,
                                       bool end_of_records);
static enum_nested_loop_state end_write(JOIN *join, QEP_TAB *qep_tab,
                                        bool end_of_records);
static enum_nested_loop_state end_write_wf(JOIN *join, QEP_TAB *qep_tab,
                                           bool end_of_records);
static enum_nested_loop_state end_update(JOIN *join, QEP_TAB *qep_tab,
                                         bool end_of_records);

static int read_system(TABLE *table);
static int read_const(TABLE *table, TABLE_REF *ref);
static bool remove_dup_with_compare(THD *thd, TABLE *entry, Field **field,
                                    ulong offset, Item *having);
static bool remove_dup_with_hash_index(THD *thd, TABLE *table,
                                       Field **first_field,
                                       const size_t *field_lengths,
                                       size_t key_length, Item *having);
static int do_sj_reset(SJ_TMP_TABLE *sj_tbl);
static bool alloc_group_fields(JOIN *join, ORDER *group);
static void SetCostOnTableIterator(const Cost_model_server &cost_model,
                                   const POSITION *pos, bool is_after_filter,
                                   RowIterator *iterator);

/**
   Evaluates HAVING condition
   @returns true if TRUE, false if FALSE or NULL
   @note this uses val_int() and relies on the convention that val_int()
   returns 0 when the value is NULL.
*/
static bool having_is_true(Item *h) {
  if (h == nullptr) {
    DBUG_PRINT("info", ("no HAVING"));
    return true;
  }
  bool rc = h->val_int();
  DBUG_PRINT("info", ("HAVING is %d", (int)rc));
  return rc;
}

/// Maximum amount of space (in bytes) to allocate for a Record_buffer.
static constexpr size_t MAX_RECORD_BUFFER_SIZE = 128 * 1024;  // 128KB

namespace {

string RefToString(const TABLE_REF &ref, const KEY *key, bool include_nulls) {
  string ret;

  const uchar *key_buff = ref.key_buff;

  for (unsigned key_part_idx = 0; key_part_idx < ref.key_parts;
       ++key_part_idx) {
    if (key_part_idx != 0) {
      ret += ", ";
    }
    const Field *field = key->key_part[key_part_idx].field;
    if (field->is_field_for_functional_index()) {
      // Do not print out the column name if the column represents a functional
      // index. Instead, print out the indexed expression.
      ret += ItemToString(field->gcol_info->expr_item);
    } else {
      DBUG_ASSERT(!field->is_hidden_from_user());
      ret += field->field_name;
    }
    ret += "=";
    ret += ItemToString(ref.items[key_part_idx]);

    // If we have ref_or_null access, find out if this keypart is the one that
    // is -or-NULL (there's always only a single one).
    if (include_nulls && key_buff == ref.null_ref_key) {
      ret += " or NULL";
    }
    key_buff += key->key_part[key_part_idx].store_length;
  }
  return ret;
}

}  // namespace

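/*
  Illustration (hypothetical plan): for a ref access over a two-part index
  on (a, b) with lookup values 3 and t2.x, RefToString() would produce
  "a=3, b=t2.x"; with ref_or_null access where the second keypart is the
  -or-NULL one, it would produce "a=3, b=t2.x or NULL".
*/
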
/**
  Execute select, executor entry point.

  @todo
    When can thd->net.report_error be nonzero here?

  @note that EXPLAIN may come here (single-row derived table, uncorrelated
    scalar subquery in WHERE clause...).
*/

void JOIN::exec() {
  Opt_trace_context *const trace = &thd->opt_trace;
  Opt_trace_object trace_wrapper(trace);
  Opt_trace_object trace_exec(trace, "join_execution");
  trace_exec.add_select_number(select_lex->select_number);
  Opt_trace_array trace_steps(trace, "steps");
  List<Item> *columns_list = &fields_list;
  DBUG_TRACE;

  DBUG_ASSERT(select_lex == thd->lex->current_select());

  /*
    Check that we either
    - have no tables, or
    - have tables and have locked them, or
    - called for fake_select_lex, which may have temporary tables which do
      not need locking up front.
  */
  DBUG_ASSERT(!tables || thd->lex->is_query_tables_locked() ||
              select_lex == unit->fake_select_lex);

  THD_STAGE_INFO(thd, stage_executing);
  DEBUG_SYNC(thd, "before_join_exec");

  set_executed();

  if (prepare_result()) return;

  if (m_windows.elements > 0 && !m_windowing_steps) {
    // Initialize state of window functions as end_write_wf() will be shortcut
    for (Window &w : m_windows) {
      w.reset_all_wf_state();
    }
  }

  Query_result *const query_result = select_lex->query_result();

  do_send_rows = unit->select_limit_cnt > 0;

  if (!tables_list &&
      (tables || !select_lex->with_sum_func)) {  // Only test of functions
    /*
      We have to test for 'conds' here as the WHERE may not be constant
      even if we don't have any tables for prepared statements or if
      conds uses something like 'rand()'.

      Don't evaluate the having clause here. return_zero_rows() should
      be called only for cases where there are no matching rows after
      evaluating all conditions except the HAVING clause.
    */
    if (select_lex->cond_value != Item::COND_FALSE &&
        (!where_cond || where_cond->val_int())) {
      if (query_result->send_result_set_metadata(
              thd, *columns_list, Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
        return;

      /*
        If the HAVING clause is either impossible or always true, then
        JOIN::having is set to NULL by optimize_cond.
        In this case JOIN::exec must check for JOIN::having_value, in the
        same way it checks for JOIN::cond_value.
      */
      if (((select_lex->having_value != Item::COND_FALSE) &&
           having_is_true(having_cond)) &&
          should_send_current_row() &&
          query_result->send_data(thd, fields_list))
        error = 1;
      else {
        error = (int)query_result->send_eof(thd);
        send_records = calc_found_rows ? 1 : thd->get_sent_row_count();
      }
      /* Query block (without union) always returns 0 or 1 row */
      thd->current_found_rows = send_records;
    } else {
      return_zero_rows(this, *columns_list);
    }
    return;
  }

  if (zero_result_cause) {
    return_zero_rows(this, *columns_list);
    return;
  }

  /*
    Initialize examined rows here because the values from all join parts
    must be accumulated in examined_row_count. Hence every join
    iteration must count from zero.
  */
  examined_rows = 0;

  /* XXX: When can thd->is_error() be nonzero at this point? */
  if (thd->is_error()) {
    error = thd->is_error();
    return;
  }

  DBUG_PRINT("info", ("%s", thd->proc_info));
  if (query_result->send_result_set_metadata(
          thd, *fields, Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF)) {
    /* purecov: begin inspected */
    error = 1;
    return;
    /* purecov: end */
  }
  error = do_select(this);
  /* Accumulate the counts from all join iterations of all join parts. */
  thd->inc_examined_row_count(examined_rows);
  DBUG_PRINT("counts", ("thd->examined_row_count: %lu",
                        (ulong)thd->get_examined_row_count()));
}

bool JOIN::create_intermediate_table(QEP_TAB *const tab,
                                     List<Item> *tmp_table_fields,
                                     ORDER_with_src &tmp_table_group,
                                     bool save_sum_fields) {
  DBUG_TRACE;
  THD_STAGE_INFO(thd, stage_creating_tmp_table);
  const bool windowing = m_windows.elements > 0;
  /*
    Pushing LIMIT to the temporary table creation is not applicable
    when there is ORDER BY or GROUP BY or aggregate/window functions, because
    in all these cases we need all result rows.
  */
  ha_rows tmp_rows_limit =
      ((order == NULL || skip_sort_order) && !tmp_table_group && !windowing &&
       !select_lex->with_sum_func)
          ? m_select_limit
          : HA_POS_ERROR;

  tab->tmp_table_param = new (thd->mem_root) Temp_table_param(tmp_table_param);
  tab->tmp_table_param->skip_create_table = true;

  bool distinct_arg =
      select_distinct &&
      // GROUP BY is absent or has been done in a previous step
      !group_list &&
      // We can only do DISTINCT in last window's tmp table step
      (!windowing || (tab->tmp_table_param->m_window &&
                      tab->tmp_table_param->m_window->is_last()));

  TABLE *table =
      create_tmp_table(thd, tab->tmp_table_param, *tmp_table_fields,
                       tmp_table_group, distinct_arg, save_sum_fields,
                       select_lex->active_options(), tmp_rows_limit, "");
  if (!table) return true;
  tmp_table_param.using_outer_summary_function =
      tab->tmp_table_param->using_outer_summary_function;

  DBUG_ASSERT(tab->idx() > 0);
  tab[-1].next_select = sub_select_op;
  if (!(tab->op = new (thd->mem_root) QEP_tmp_table(tab))) goto err;

  tab->set_table(table);
  tab->set_temporary_table_deduplicates(distinct_arg ||
                                        tmp_table_group != nullptr);

  /**
    If this is a window's OUT table, any final DISTINCT, ORDER BY will lead to
    windows showing use of tmp table in the final windowing step, so no
    need to signal use of tmp table unless we are here for another tmp table.
  */
  if (!tab->tmp_table_param->m_window) {
    if (table->group)
      explain_flags.set(tmp_table_group.src, ESP_USING_TMPTABLE);
    else if (table->is_distinct || select_distinct)
      explain_flags.set(ESC_DISTINCT, ESP_USING_TMPTABLE);
    else {
      /*
        Try to find a reason for this table, to show in EXPLAIN.
        If there's no GROUP BY, no ORDER BY, no DISTINCT, it must be just a
        result buffer. If there's ORDER BY but there is also windowing
        then ORDER BY happens after windowing, and here we are before
        windowing, so the table is not for ORDER BY either.
      */
      if ((!group_list && (!order || windowing) && !select_distinct) ||
          (select_lex->active_options() &
           (SELECT_BIG_RESULT | OPTION_BUFFER_RESULT)))
        explain_flags.set(ESC_BUFFER_RESULT, ESP_USING_TMPTABLE);
    }
  }
  /* if group or order on first table, sort first */
  if (group_list && simple_group) {
    DBUG_PRINT("info", ("Sorting for group"));

    if (m_ordered_index_usage != ORDERED_INDEX_GROUP_BY &&
        add_sorting_to_table(const_tables, &group_list))
      goto err;

    if (alloc_group_fields(this, group_list)) goto err;
    if (make_sum_func_list(all_fields, fields_list, true)) goto err;
    const bool need_distinct =
        !(tab->quick() && tab->quick()->is_agg_loose_index_scan());
    if (prepare_sum_aggregators(sum_funcs, need_distinct)) goto err;
    if (setup_sum_funcs(thd, sum_funcs)) goto err;
    group_list = NULL;
  } else {
    if (make_sum_func_list(all_fields, fields_list, false)) goto err;
    const bool need_distinct =
        !(tab->quick() && tab->quick()->is_agg_loose_index_scan());
    if (prepare_sum_aggregators(sum_funcs, need_distinct)) goto err;
    if (setup_sum_funcs(thd, sum_funcs)) goto err;

    if (!group_list && !table->is_distinct && order && simple_order &&
        !m_windows_sort) {
      DBUG_PRINT("info", ("Sorting for order"));

      if (m_ordered_index_usage != ORDERED_INDEX_ORDER_BY &&
          add_sorting_to_table(const_tables, &order))
        goto err;
      order = NULL;
    }
  }
  return false;

err:
  if (table != NULL) {
    free_tmp_table(thd, table);
    tab->set_table(NULL);
  }
  return true;
}

/**
  Send all rollup levels higher than the current one to the client.

  @b SAMPLE
    @code
      SELECT a, b, SUM(c) FROM t1 GROUP BY a, b WITH ROLLUP
    @endcode

  @param idx  Level we are on:
              - 0 = Total sum level
              - 1 = First group changed  (a)
              - 2 = Second group changed (a,b)

  @returns false if success, true if error
*/

bool JOIN::rollup_send_data(uint idx) {
  uint save_slice = current_ref_item_slice;
  for (uint i = send_group_parts; i-- > idx;) {
    // Get references to sum functions in place
    copy_ref_item_slice(ref_items[REF_SLICE_ACTIVE], rollup.ref_item_arrays[i]);
    current_ref_item_slice = -1;  // as we switched to a not-numbered slice
    if (having_is_true(having_cond)) {
      if (send_records < unit->select_limit_cnt && should_send_current_row() &&
          select_lex->query_result()->send_data(thd, rollup.fields_list[i]))
        return true;
      send_records++;
    }
  }
  // Restore ref_items array
  set_ref_item_slice(save_slice);
  return false;
}

/**
  Checks if an item has a ROLLUP NULL which needs to be written to
  temp table.

  @param item         Item for which we need to detect if ROLLUP
                      NULL has to be written.

  @returns false if ROLLUP NULL need not be written for this item.
           true if it has to be written.
*/

bool has_rollup_result(Item *item) {
  if (item->type() == Item::NULL_RESULT_ITEM) return true;

  if (item->type() == Item::FUNC_ITEM) {
    for (uint i = 0; i < ((Item_func *)item)->arg_count; i++) {
      Item *real_item = ((Item_func *)item)->arguments()[i];
      while (real_item->type() == Item::REF_ITEM)
        real_item = *((down_cast<Item_ref *>(real_item))->ref);

      if (real_item->type() == Item::NULL_RESULT_ITEM)
        return true;
      else if (real_item->type() == Item::FUNC_ITEM &&
               has_rollup_result(real_item))
        return true;
    }
  }
  return false;
}

/**
  Write all rollup levels higher than the current one to a temp table.

  @b SAMPLE
    @code
      SELECT a, b, SUM(c) FROM t1 GROUP BY a,b WITH ROLLUP
    @endcode

  @param idx      Level we are on:
                  - 0 = Total sum level
                  - 1 = First group changed  (a)
                  - 2 = Second group changed (a,b)
  @param qep_tab  temp table

  @returns false if success, true if error
*/

bool JOIN::rollup_write_data(uint idx, QEP_TAB *qep_tab) {
  uint save_slice = current_ref_item_slice;
  for (uint i = send_group_parts; i-- > idx;) {
    // Get references to sum functions in place
    copy_ref_item_slice(ref_items[REF_SLICE_ACTIVE], rollup.ref_item_arrays[i]);
    current_ref_item_slice = -1;  // as we switched to a not-numbered slice
    if (having_is_true(qep_tab->having)) {
      int write_error;
      for (Item &item : rollup.all_fields[i]) {
        /*
          Save the values of rollup expressions in the temporary table.
          Unless it is a literal NULL value, make sure there is actually
          a temporary table field created for it.
        */
        if ((item.type() == Item::NULL_RESULT_ITEM) ||
            (has_rollup_result(&item) && item.get_tmp_table_field() != nullptr))
          item.save_in_result_field(1);
      }
      copy_sum_funcs(sum_funcs_end[i + 1], sum_funcs_end[i]);
      TABLE *table_arg = qep_tab->table();
      if ((write_error = table_arg->file->ha_write_row(table_arg->record[0]))) {
        if (create_ondisk_from_heap(thd, table_arg, write_error, false, NULL))
          return true;
      }
    }
  }
  set_ref_item_slice(save_slice);  // Restore ref_items array
  return false;
}

void JOIN::optimize_distinct() {
  for (int i = primary_tables - 1; i >= 0; --i) {
    QEP_TAB *last_tab = qep_tab + i;
    if (select_lex->select_list_tables & last_tab->table_ref->map()) break;
    last_tab->not_used_in_distinct = true;
  }

  /* Optimize "select distinct b from t1 order by key_part_1 limit #" */
  if (order && skip_sort_order) {
    /* Should already have been optimized away */
    DBUG_ASSERT(m_ordered_index_usage == ORDERED_INDEX_ORDER_BY);
    if (m_ordered_index_usage == ORDERED_INDEX_ORDER_BY) {
      order = NULL;
    }
  }
}

bool prepare_sum_aggregators(Item_sum **func_ptr, bool need_distinct) {
  Item_sum *func;
  DBUG_TRACE;
  while ((func = *(func_ptr++))) {
    if (func->set_aggregator(need_distinct && func->has_with_distinct()
                                 ? Aggregator::DISTINCT_AGGREGATOR
                                 : Aggregator::SIMPLE_AGGREGATOR))
      return true;
  }
  return false;
}

/******************************************************************************
  Code for calculating functions
******************************************************************************/

/**
  Call @c setup() for all sum functions.

  @param thd      thread handler
  @param func_ptr sum function list

  @retval
    false  ok
  @retval
    true   error
*/

bool setup_sum_funcs(THD *thd, Item_sum **func_ptr) {
  Item_sum *func;
  DBUG_TRACE;
  while ((func = *(func_ptr++))) {
    if (func->aggregator_setup(thd)) return true;
  }
  return false;
}

void init_tmptable_sum_functions(Item_sum **func_ptr) {
  DBUG_TRACE;
  Item_sum *func;
  while ((func = *(func_ptr++))) func->reset_field();
}

/** Update record 0 in tmp_table from record 1. */

void update_tmptable_sum_func(Item_sum **func_ptr,
                              TABLE *tmp_table MY_ATTRIBUTE((unused))) {
  DBUG_TRACE;
  Item_sum *func;
  while ((func = *(func_ptr++))) func->update_field();
}

/** Copy result of sum functions to record in tmp_table. */

void copy_sum_funcs(Item_sum **func_ptr, Item_sum **end_ptr) {
  DBUG_TRACE;
  for (; func_ptr != end_ptr; func_ptr++) {
    if ((*func_ptr)->result_field != nullptr) {
      (*func_ptr)->save_in_result_field(1);
    }
  }
}

bool init_sum_functions(Item_sum **func_ptr, Item_sum **end_ptr) {
  for (; func_ptr != end_ptr; func_ptr++) {
    if ((*func_ptr)->reset_and_add()) return 1;
  }
  /* If rollup, calculate the upper sum levels */
  for (; *func_ptr; func_ptr++) {
    if ((*func_ptr)->aggregator_add()) return 1;
  }
  return 0;
}

bool update_sum_func(Item_sum **func_ptr) {
  DBUG_TRACE;
  Item_sum *func;
  for (; (func = *func_ptr); func_ptr++)
    if (func->aggregator_add()) return 1;
  return 0;
}

/**
  Copy result of functions to record in tmp_table.

  Uses the thread pointer to check for errors in
  some of the val_xxx() methods called by the
  save_in_result_field() function.
  TODO: make the Item::val_xxx() return error code

  @param param  Copy functions of tmp table specified by param
  @param thd    pointer to the current thread for error checking
  @param type   type of function Items that need to be copied (used
                w.r.t. windowing functions).
  @retval
    false if OK
  @retval
    true on error
*/
bool copy_funcs(Temp_table_param *param, const THD *thd, Copy_func_type type) {
  DBUG_TRACE;
  if (!param->items_to_copy->size()) return false;

  Func_ptr_array *func_ptr = param->items_to_copy;
  uint end = func_ptr->size();
  for (uint i = 0; i < end; i++) {
    Func_ptr &func = func_ptr->at(i);
    Item *item = func.func();
    bool do_copy = false;
    switch (type) {
      case CFT_ALL:
        do_copy = true;
        break;
      case CFT_WF_FRAMING:
        do_copy = (item->m_is_window_function &&
                   down_cast<Item_sum *>(item)->framing());
        break;
      case CFT_WF_NON_FRAMING:
        do_copy = (item->m_is_window_function &&
                   !down_cast<Item_sum *>(item)->framing() &&
                   !down_cast<Item_sum *>(item)->needs_card());
        break;
      case CFT_WF_NEEDS_CARD:
        do_copy = (item->m_is_window_function &&
                   down_cast<Item_sum *>(item)->needs_card());
        break;
      case CFT_WF_USES_ONLY_ONE_ROW:
        do_copy = (item->m_is_window_function &&
                   down_cast<Item_sum *>(item)->uses_only_one_row());
        break;
      case CFT_HAS_NO_WF:
        do_copy = !item->m_is_window_function && !item->has_wf();
        break;
      case CFT_HAS_WF:
        do_copy = !item->m_is_window_function && item->has_wf();
        break;
      case CFT_WF:
        do_copy = item->m_is_window_function;
        break;
      case CFT_DEPENDING_ON_AGGREGATE:
        do_copy =
            item->has_aggregation() && item->type() != Item::SUM_FUNC_ITEM;
        break;
    }

    if (do_copy) {
      if (func.override_result_field() == nullptr) {
        item->save_in_result_field(/*no_conversions=*/true);
      } else {
        item->save_in_field(func.override_result_field(),
                            /*no_conversions=*/true);
      }
      /*
        Need to check the THD error state because Item::val_xxx() don't
        return error code, but can generate errors
        TODO: change it for a real status check when Item::val_xxx()
        are extended to return status code.
      */
      if (thd->is_error()) return true;
    }
  }
  return false;
}

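/*
  Illustration (hypothetical query): for
    SELECT a, SUM(b) + 1 FROM t GROUP BY a
  the item "SUM(b) + 1" has aggregation but is not itself a SUM_FUNC_ITEM,
  so CFT_DEPENDING_ON_AGGREGATE selects it; it is copied into its result
  field only once the aggregate value for the group is final.
*/
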
/*
  end_select-compatible function that writes the record into a sjm temptable

  SYNOPSIS
    end_sj_materialize()
      join            The join
      join_tab        Last join table
      end_of_records  false <=> This call is made to pass another record
                                combination
                      true  <=> EOF (no action)

  DESCRIPTION
    This function is used by semi-join materialization to capture the
    subquery's resultset and write it into the temptable (that is,
    materialize it).

  NOTE
    This function is used only for semi-join materialization. Non-semijoin
    materialization uses a different mechanism.

  RETURN
    NESTED_LOOP_OK
    NESTED_LOOP_ERROR
*/

static enum_nested_loop_state end_sj_materialize(JOIN *join, QEP_TAB *qep_tab,
                                                 bool end_of_records) {
  int error;
  THD *thd = join->thd;
  Semijoin_mat_exec *sjm = qep_tab[-1].sj_mat_exec();
  DBUG_TRACE;
  if (!end_of_records) {
    TABLE *table = sjm->table;

    for (Item &item : sjm->sj_nest->nested_join->sj_inner_exprs) {
      if (item.is_null()) return NESTED_LOOP_OK;
    }
    fill_record(thd, table, table->visible_field_ptr(),
                sjm->sj_nest->nested_join->sj_inner_exprs, NULL, NULL, false);
    if (thd->is_error()) return NESTED_LOOP_ERROR; /* purecov: inspected */
    if (!check_unique_constraint(table)) return NESTED_LOOP_OK;
    if ((error = table->file->ha_write_row(table->record[0]))) {
      /* create_ondisk_from_heap will generate error if needed */
      if (!table->file->is_ignorable_error(error)) {
        if (create_ondisk_from_heap(thd, table, error, true, NULL))
          return NESTED_LOOP_ERROR; /* purecov: inspected */
        /* Initialize the index, since create_ondisk_from_heap does
           not replicate the earlier index initialization */
        if (table->hash_field) table->file->ha_index_init(0, false);
      }
    }
  }
  return NESTED_LOOP_OK;
}

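/*
  Illustration (hypothetical query): for
    SELECT * FROM t1 WHERE t1.a IN (SELECT t2.b FROM t2)
  executed with semijoin materialization, each row of the inner query block
  reaches end_sj_materialize(), which writes t2.b into the temptable; rows
  where an inner expression is NULL are skipped above, since NULL can never
  satisfy the IN predicate.
*/
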
/**
  Check appearance of new constant items in multiple equalities
  of a condition after reading a constant table.

  The function retrieves the cond condition and for each encountered
  multiple equality checks whether new constants have appeared after
  reading the constant (single row) table tab. If so it adjusts
  the multiple equality appropriately.

  @param thd      thread handler
  @param cond     condition whose multiple equalities are to be checked
  @param tab      constant table that has been read
*/

static bool update_const_equal_items(THD *thd, Item *cond, JOIN_TAB *tab) {
  if (!(cond->used_tables() & tab->table_ref->map())) return false;

  if (cond->type() == Item::COND_ITEM) {
    for (Item &item : *(down_cast<Item_cond *>(cond))->argument_list()) {
      if (update_const_equal_items(thd, &item, tab)) return true;
    }
  } else if (cond->type() == Item::FUNC_ITEM &&
             down_cast<Item_func *>(cond)->functype() ==
                 Item_func::MULT_EQUAL_FUNC) {
    Item_equal *item_equal = (Item_equal *)cond;
    bool contained_const = item_equal->get_const() != NULL;
    if (item_equal->update_const(thd)) return true;
    if (!contained_const && item_equal->get_const()) {
      /* Update keys for range analysis */
      Item_equal_iterator it(*item_equal);
      Item_field *item_field;
      while ((item_field = it++)) {
        Field *field = item_field->field;
        JOIN_TAB *stat = field->table->reginfo.join_tab;
        Key_map possible_keys = field->key_start;
        possible_keys.intersect(field->table->keys_in_use_for_query);
        stat[0].const_keys.merge(possible_keys);
        stat[0].keys().merge(possible_keys);

        /*
          For each field in the multiple equality (for which we know that it
          is a constant) we have to find its corresponding key part, and set
          that key part in const_key_parts.
        */
        if (!possible_keys.is_clear_all()) {
          TABLE *const table = field->table;
          for (Key_use *use = stat->keyuse();
               use && use->table_ref == item_field->table_ref; use++) {
            if (possible_keys.is_set(use->key) &&
                table->key_info[use->key].key_part[use->keypart].field == field)
              table->const_key_parts[use->key] |= use->keypart_map;
          }
        }
      }
    }
  }
  return false;
}

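/*
  Illustration (hypothetical query): in
    SELECT * FROM t1, t2 WHERE t1.a = t2.a AND t2.pk = 1
  t2 is a constant (single-row) table. Once it has been read, the multiple
  equality (t1.a, t2.a) acquires t2.a's value as a constant, so keys on
  t1.a become usable for range analysis; that is what this function records
  in const_keys and const_key_parts.
*/
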
/**
  For some reason, e.g. due to an impossible WHERE clause, the tables cannot
  possibly contain any rows that will be in the result. This function
  is used to return with a result based on no matching rows (i.e., an
  empty result or one row with aggregates calculated without using
  rows in the case of implicit grouping) before the execution of
  nested loop join.

  This function may evaluate the HAVING clause and is only meant for
  result sets that are empty due to an impossible HAVING clause. Do
  not use it if HAVING has already been evaluated.

  @param join    The join that does not produce a row
  @param fields  Fields in result
*/
static void return_zero_rows(JOIN *join, List<Item> &fields) {
  DBUG_TRACE;

  join->join_free();

  /* Update results for FOUND_ROWS */
  if (!join->send_row_on_empty_set()) {
    join->thd->current_found_rows = 0;
  }

  SELECT_LEX *const select = join->select_lex;
  THD *thd = join->thd;

  if (!(select->query_result()->send_result_set_metadata(
          thd, fields, Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))) {
    bool send_error = false;
    if (join->send_row_on_empty_set()) {
      // Mark tables as containing only NULL values
      for (TABLE_LIST *table = select->leaf_tables; table;
           table = table->next_leaf)
        table->table->set_null_row();

      // Calculate aggregate functions for no rows

      /*
        Must notify all fields that there are no rows (not only those
        that will be returned) because join->having may refer to
        fields that are not part of the result columns.
      */
      for (Item &item : join->all_fields) {
        item.no_rows_in_result();
      }

      if (having_is_true(join->having_cond) && join->should_send_current_row())
        send_error = select->query_result()->send_data(thd, fields);
    }
    if (!send_error) select->query_result()->send_eof(thd);  // Should be safe
  }
}

/**
  @brief Setup write_func of QEP_tmp_table object

  @param tab   QEP_TAB of a tmp table
  @param trace Opt_trace_object to add to
  @details
  Function sets up write_func according to how the QEP_tmp_table object
  attached to the given join_tab will be used in the query.
*/

void setup_tmptable_write_func(QEP_TAB *tab, Opt_trace_object *trace) {
  DBUG_TRACE;
  JOIN *join = tab->join();
  TABLE *table = tab->table();
  QEP_tmp_table *op = (QEP_tmp_table *)tab->op;
  Temp_table_param *const tmp_tbl = tab->tmp_table_param;
  uint phase = tab->ref_item_slice;
  const char *description = nullptr;
  DBUG_ASSERT(table && op);

  if (table->group && tmp_tbl->sum_func_count &&
      !tmp_tbl->precomputed_group_by) {
    /*
      Note for MyISAM tmp tables: if uniques is true keys won't be
      created.
    */
    DBUG_ASSERT(phase < REF_SLICE_WIN_1);
    if (table->s->keys) {
      description = "continuously_update_group_row";
      op->set_write_func(end_update);
    }
  } else if (join->streaming_aggregation && !tmp_tbl->precomputed_group_by) {
    DBUG_ASSERT(phase < REF_SLICE_WIN_1);
    description = "write_group_row_when_complete";
    DBUG_PRINT("info", ("Using end_write_group"));
    op->set_write_func(end_write_group);
  } else {
    description = "write_all_rows";
    op->set_write_func(phase >= REF_SLICE_WIN_1 ? end_write_wf : end_write);
    if (tmp_tbl->precomputed_group_by) {
      Item_sum **func_ptr = join->sum_funcs;
      Item_sum *func;
      while ((func = *(func_ptr++))) {
        tmp_tbl->items_to_copy->push_back(Func_ptr(func));
      }
    }
  }
  if (description) trace->add_alnum("write_method", description);
}

/**
  @details
  Rows produced by a join sweep may end up in a temporary table or be sent
  to a client. Setup the function of the nested loop join algorithm which
  handles final fully constructed and matched records.

  @return
    end_select function to use. This function can't fail.
*/
Next_select_func JOIN::get_end_select_func() {
  DBUG_TRACE;
  /*
    Choose method for presenting result to user. Use end_send_group
    if the query requires grouping (has a GROUP BY clause and/or one or
    more aggregate functions). Use end_send if the query should not
    be grouped.
  */
  if (streaming_aggregation && !tmp_table_param.precomputed_group_by) {
    DBUG_PRINT("info", ("Using end_send_group"));
    return end_send_group;
  }
  DBUG_PRINT("info", ("Using end_send"));
  return end_send;
}

/**
  Find out how many bytes it takes to store the smallest prefix which
  covers all the columns that will be read from a table.

  @param qep_tab the table to read
  @return the size of the smallest prefix that covers all columns to be
          read from the table
*/
static size_t record_prefix_size(const QEP_TAB *qep_tab) {
  const TABLE *table = qep_tab->table();

  /*
    Find the end of the last column that is read, or the beginning of
    the record if no column is read.

    We want the column that is physically last in table->record[0],
    which is not necessarily the column that is last in table->field.
    For example, virtual columns come at the end of the record, even
    if they are not at the end of table->field. This means we need to
    inspect all the columns in the read set and take the one with the
    highest end pointer.
  */
  uchar *prefix_end = table->record[0];  // beginning of record
  for (auto f = table->field, end = table->field + table->s->fields; f < end;
       ++f) {
    if (bitmap_is_set(table->read_set, (*f)->field_index))
      prefix_end = std::max(prefix_end, (*f)->ptr + (*f)->pack_length());
  }

  /*
    If this is an index merge, the primary key columns may be required
    for positioning in a later stage, even though they are not in the
    read_set here. Allocate space for them in case they are needed.
    Also allocate space for them for dynamic ranges, because they can
    switch to index merge for a subsequent scan.
  */
  if ((qep_tab->type() == JT_INDEX_MERGE || qep_tab->dynamic_range()) &&
      !table->s->is_missing_primary_key() &&
      (table->file->ha_table_flags() & HA_PRIMARY_KEY_REQUIRED_FOR_POSITION)) {
    const KEY &key = table->key_info[table->s->primary_key];
    for (auto kp = key.key_part, end = kp + key.user_defined_key_parts;
         kp < end; ++kp) {
      const Field *f = table->field[kp->fieldnr - 1];
      /*
        If a key column comes after all the columns in the read set,
        extend the prefix to include the key column.
      */
      prefix_end = std::max(prefix_end, f->ptr + f->pack_length());
    }
  }

  return prefix_end - table->record[0];
}

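/*
  Illustration (hypothetical record layout): for columns a INT (4 bytes),
  b INT (4 bytes), c VARCHAR(10), with only "b" in the read set, the prefix
  must reach from the start of the record to the end of "b", so the
  function returns 8 even though "a" itself is never read.
*/
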
/**
  Allocate a data buffer that the storage engine can use for fetching
  batches of records.

  A buffer is only allocated if ha_is_record_buffer_wanted() returns true
  for the handler, and the scan in question is of a kind that could be
  expected to benefit from fetching records in batches.

  @param tab the table to read
  @retval true if an error occurred when allocating the buffer
  @retval false if a buffer was successfully allocated, or if no
          allocation was attempted
*/
bool set_record_buffer(const QEP_TAB *tab) {
  if (tab == nullptr) return false;

  TABLE *const table = tab->table();

  DBUG_ASSERT(table->file->inited);
  DBUG_ASSERT(table->file->ha_get_record_buffer() == nullptr);

  // Skip temporary tables.
  if (tab->position() == nullptr) return false;

  // Don't allocate a buffer for loose index scan.
  if (tab->quick_optim() && tab->quick_optim()->is_loose_index_scan())
    return false;

  // Only create a buffer if the storage engine wants it.
  ha_rows max_rows = 0;
  if (!table->file->ha_is_record_buffer_wanted(&max_rows) || max_rows == 0)
    return false;

  // If we already have a buffer, reuse it.
  if (table->m_record_buffer.max_records() > 0) {
    /*
      Assume that the existing buffer has the shape we want. That is, the
      record size shouldn't change for a table during execution.
    */
    DBUG_ASSERT(table->m_record_buffer.record_size() ==
                record_prefix_size(tab));
    table->m_record_buffer.reset();
    table->file->ha_set_record_buffer(&table->m_record_buffer);
    return false;
  }

  // How many rows do we expect to fetch?
  double rows_to_fetch = tab->position()->rows_fetched;

  /*
    If this is the outer table of a join and there is a limit defined
    on the query block, adjust the buffer size accordingly.
  */
  const JOIN *const join = tab->join();
  if (tab->idx() == 0 && join->m_select_limit != HA_POS_ERROR) {
    /*
      Estimated number of rows returned by the join per qualifying row
      in the outer table.
    */
    double fanout = 1.0;
    for (uint i = 1; i < join->primary_tables; i++) {
      const auto p = join->qep_tab[i].position();
      fanout *= p->rows_fetched * p->filter_effect;
    }

    /*
      The number of qualifying rows to read from the outer table in
      order to reach the limit is limit / fanout. Divide by
      filter_effect to get the total number of qualifying and
      non-qualifying rows to fetch to reach the limit.
    */
    rows_to_fetch = std::min(rows_to_fetch, join->m_select_limit / fanout /
                                                tab->position()->filter_effect);
  }

  ha_rows rows_in_buffer = static_cast<ha_rows>(std::ceil(rows_to_fetch));

  // No need for a multi-row buffer if we don't expect multiple rows.
  if (rows_in_buffer <= 1) return false;

  /*
    How much space do we need to allocate for each record? Enough to
    hold all columns from the beginning and up to the last one in the
    read set. We don't need to allocate space for unread columns at
    the end of the record.
  */
  const size_t record_size = record_prefix_size(tab);

  // Do not allocate a buffer whose total size exceeds MAX_RECORD_BUFFER_SIZE.
  if (record_size > 0)
    rows_in_buffer =
        std::min<ha_rows>(MAX_RECORD_BUFFER_SIZE / record_size, rows_in_buffer);

  // Do not allocate space for more rows than the handler asked for.
  rows_in_buffer = std::min(rows_in_buffer, max_rows);

  const auto bufsize = Record_buffer::buffer_size(rows_in_buffer, record_size);
  const auto ptr = static_cast<uchar *>(table->in_use->alloc(bufsize));
  if (ptr == nullptr) return true; /* purecov: inspected */

  table->m_record_buffer = Record_buffer{rows_in_buffer, record_size, ptr};
  table->file->ha_set_record_buffer(&table->m_record_buffer);
  return false;
}

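/*
  Worked example of the sizing above (illustrative numbers): with LIMIT 100
  on the query block, a single inner table with rows_fetched = 20 and
  filter_effect = 0.5 (fanout = 10), and filter_effect = 0.1 on the outer
  table, the outer table needs about 100 / 10 / 0.1 = 100 fetched rows to
  satisfy the limit, so rows_to_fetch is capped at 100 before the
  MAX_RECORD_BUFFER_SIZE and max_rows caps are applied.
*/
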
/**
  Split AND conditions into their constituent parts, recursively.
  Conditions that are not AND conditions are appended unchanged onto
  condition_parts. E.g. if you have ((a AND b) AND c), condition_parts
  will contain [a, b, c], plus whatever it contained before the call.
*/
static void ExtractConditions(Item *condition,
                              vector<Item *> *condition_parts) {
  if (condition == nullptr) {
    return;
  }
  if (condition->type() != Item::COND_ITEM ||
      down_cast<Item_cond *>(condition)->functype() !=
          Item_bool_func2::COND_AND_FUNC) {
    condition_parts->push_back(condition);
    return;
  }

  Item_cond_and *and_condition = down_cast<Item_cond_and *>(condition);
  for (Item &item : *and_condition->argument_list()) {
    ExtractConditions(&item, condition_parts);
  }
}

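/*
  Usage sketch (hypothetical caller): given a WHERE condition of the form
  (a AND (b AND c)),
    vector<Item *> parts;
    ExtractConditions(where_cond, &parts);
  leaves parts == [a, b, c]; PossiblyAttachFilterIterator() below can then
  recombine the surviving parts into a single Item_cond_and.
*/
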
/**
  Return a new iterator that wraps "iterator" and that tests all of the given
  conditions (if any), ANDed together. If there are no conditions, just return
  the given iterator back.
*/
unique_ptr_destroy_only<RowIterator> PossiblyAttachFilterIterator(
    unique_ptr_destroy_only<RowIterator> iterator,
    const vector<Item *> &conditions, THD *thd) {
  if (conditions.empty()) {
    return iterator;
  }

  Item *condition = nullptr;
  if (conditions.size() == 1) {
    condition = conditions[0];
  } else {
    List<Item> items;
    for (Item *condition : conditions) {
      items.push_back(condition);
    }
    condition = new Item_cond_and(items);
    condition->quick_fix_field();
    condition->update_used_tables();
    condition->apply_is_true();
  }

  RowIterator *child_iterator = iterator.get();
  unique_ptr_destroy_only<RowIterator> filter_iterator =
      NewIterator<FilterIterator>(thd, move(iterator), condition);

  // Copy costs (we don't care about filter_effect here, even though we
  // should).
  filter_iterator->set_expected_rows(child_iterator->expected_rows());
  filter_iterator->set_estimated_cost(child_iterator->estimated_cost());

  return filter_iterator;
}

unique_ptr_destroy_only<RowIterator> CreateNestedLoopIterator(
    THD *thd, unique_ptr_destroy_only<RowIterator> left_iterator,
    unique_ptr_destroy_only<RowIterator> right_iterator, JoinType join_type,
    bool pfs_batch_mode) {
  if (join_type == JoinType::ANTI || join_type == JoinType::SEMI) {
    // This does not make sense as an optimization for anti- or semijoins.
    pfs_batch_mode = false;
  }

  return NewIterator<NestedLoopIterator>(thd, move(left_iterator),
                                         move(right_iterator), join_type,
                                         pfs_batch_mode);
}

static unique_ptr_destroy_only<RowIterator> CreateInvalidatorIterator(
    THD *thd, QEP_TAB *qep_tab, unique_ptr_destroy_only<RowIterator> iterator) {
  RowIterator *child_iterator = iterator.get();

  unique_ptr_destroy_only<RowIterator> invalidator =
      NewIterator<CacheInvalidatorIterator>(thd, move(iterator),
                                            qep_tab->table()->alias);

  // Copy costs.
  invalidator->set_expected_rows(child_iterator->expected_rows());
  invalidator->set_estimated_cost(child_iterator->estimated_cost());

  table_map deps = qep_tab->lateral_derived_tables_depend_on_me;
  for (QEP_TAB **tab2 = qep_tab->join()->map2qep_tab; deps;
       tab2++, deps >>= 1) {
    if (!(deps & 1)) continue;
    if ((*tab2)->invalidators == nullptr) {
      (*tab2)->invalidators = new (thd->mem_root)
          Mem_root_array<const CacheInvalidatorIterator *>(thd->mem_root);
    }
    (*tab2)->invalidators->push_back(
        down_cast<CacheInvalidatorIterator *>(invalidator->real_iterator()));
  }
  return invalidator;
}

static unique_ptr_destroy_only<RowIterator> PossiblyAttachFilterIterator(
    unique_ptr_destroy_only<RowIterator> iterator,
    const vector<PendingCondition> &conditions, THD *thd) {
  vector<Item *> stripped_conditions;
  for (const PendingCondition &cond : conditions) {
    stripped_conditions.push_back(cond.cond);
  }
  return PossiblyAttachFilterIterator(move(iterator), stripped_conditions, thd);
}

static Item_func_trig_cond *GetTriggerCondOrNull(Item *item) {
  if (item->type() == Item::FUNC_ITEM &&
      down_cast<Item_func *>(item)->functype() ==
          Item_bool_func2::TRIG_COND_FUNC) {
    return down_cast<Item_func_trig_cond *>(item);
  } else {
    return nullptr;
  }
}

enum CallingContext {
  TOP_LEVEL,
  DIRECTLY_UNDER_SEMIJOIN,
  DIRECTLY_UNDER_OUTER_JOIN,
  DIRECTLY_UNDER_WEEDOUT
};

/**
  For historical reasons, derived table materialization and temporary
  table materialization didn't specify the fields to materialize in the
  same way. Temporary table materialization used copy_fields() and
  copy_funcs() (also reused for aggregation; see the comments on
  AggregateIterator for the relation between aggregations and temporary
  tables) to get the data into the Field pointers of the temporary table
  to be written, storing the lists in copy_fields and items_to_copy.

  However, derived table materialization used JOIN::fields (which is a
  set of Item, not Field!) for the same purpose, calling fill_record()
  (which originally was meant for INSERT and UPDATE) instead. Thus, we
  have to rewrite one to the other, so that we can have only one
  MaterializeIterator. We choose to rewrite JOIN::fields to
  copy_fields/items_to_copy.

  TODO: The optimizer should output just one kind of structure directly.
*/
void ConvertItemsToCopy(List<Item> *items, Field **fields,
                        Temp_table_param *param, JOIN *join) {
  DBUG_ASSERT(param->items_to_copy == nullptr);

  const bool replaced_items_for_rollup =
      (join != nullptr && join->replaced_items_for_rollup);

  // All fields are to be copied.
  Func_ptr_array *copy_func =
      new (current_thd->mem_root) Func_ptr_array(current_thd->mem_root);
  Field **field_ptr = fields;
  for (Item &item : *items) {
    Item *real_item = item.real_item();
    if (real_item->type() == Item::FIELD_ITEM) {
      Field *from_field = (pointer_cast<Item_field *>(real_item))->field;
      Field *to_field = *field_ptr;
      param->copy_fields.emplace_back(to_field, from_field, /*save=*/true);

      // If any of the Item_null_result items are set to save in this field,
      // forward them to the new field instead. See below for the result fields
      // for the other items.
      if (replaced_items_for_rollup) {
        for (size_t rollup_level = 0; rollup_level < join->send_group_parts;
             ++rollup_level) {
          for (Item &item : join->rollup.fields_list[rollup_level]) {
            if (item.type() == Item::NULL_RESULT_ITEM &&
                item.get_result_field() == from_field) {
              item.set_result_field(to_field);
            }
          }
        }
      }
    } else if (item.real_item()->is_result_field()) {
      Field *from_field = item.real_item()->get_result_field();
      Field *to_field = *field_ptr;
      item.set_result_field(to_field);
      copy_func->push_back(Func_ptr(&item));

      // Similarly to above, set the right result field for any aggregates
      // that we might output as part of rollup.
      if (replaced_items_for_rollup && &item != real_item) {
        for (Item_sum **func_ptr = join->sum_funcs;
             func_ptr != join->sum_funcs_end[join->send_group_parts];
             ++func_ptr) {
          if ((*func_ptr)->get_result_field() == from_field) {
            (*func_ptr)->set_result_field(to_field);
          }
        }
      }
    } else {
      Func_ptr ptr(&item);
      ptr.set_override_result_field(*field_ptr);
      copy_func->push_back(ptr);
    }
    ++field_ptr;
  }
  param->items_to_copy = copy_func;

  if (replaced_items_for_rollup) {
    // Patch up the rollup items so that they save in the same field as
    // the ref would. This is required because we call save_in_result_field()
    // directly on each field in the rollup field list
    // (in AggregateIterator::Read), not on the Item_ref in join->fields.
    for (size_t rollup_level = 0; rollup_level < join->send_group_parts;
         ++rollup_level) {
      List_STL_Iterator<Item> item_it = join->fields->begin();
      for (Item &item : join->rollup.fields_list[rollup_level]) {
        // For cases where we need an Item_null_result, the field in
        // join->fields often does not have the right result field set.
        // However, the Item_null_result field does after we patched it
        // up earlier in the function.
        if (item.type() != Item::NULL_RESULT_ITEM) {
          item.set_result_field(item_it->get_result_field());
        }
        ++item_it;
      }
    }
  }
}

/** Similar to PendingCondition, but for cache invalidator iterators. */
struct PendingInvalidator {
  /**
    The table whose every (post-join) row invalidates one or more derived
    lateral tables.
  */
  QEP_TAB *qep_tab;
  int table_index_to_attach_to;  // -1 means “on the last possible outer join”.
};

/*
  There are four kinds of conditions stored into a table's QEP_TAB object:

  1. Join conditions (where not optimized into EQ_REF accesses or similar).
     These are attached as a condition on the rightmost table of the join;
     if it's an outer join, they are wrapped in a “not_null_compl”
     condition, to mark that they should not be applied to the NULL values
     synthesized when no row is found. These can be kept on the table, and
     we don't really need the not_null_compl wrapper as long as we don't
     move the condition up above the join (which we don't).

  2. WHERE predicates referring to the table, and possibly also one or more
     earlier tables in the join. These should normally be kept on the table,
     so we can discard rows as early as possible (but see next point).
     We should test these after the join conditions, though, as they may
     have side effects. Also note that these may be pushed below sort
     operations for efficiency -- in fact, they already have, so we should
     not try to re-apply them.

  3. Predicates like in #2 that are on the inner (right) side of a
     left join. These conditions must be moved _above_ the join, as they
     should also be tested for NULL-complemented rows the join may generate.
     E.g., for t1 LEFT JOIN t2 WHERE t1.x + t2.x > 3, the condition will be
     attached to t2's QEP_TAB, but needs to be attached above the join, or
     it would erroneously keep rows wherever t2 did not produce a
     (real) row. Such conditions are marked with a “found” trigger (in the
     old execution engine, which tested qep_tab->condition() both before and
     after the join, it would need to be exempt from the first test).

  4. Predicates that are #1 _and_ #3. These can happen with more complicated
     outer joins; e.g., with t1 LEFT JOIN ( t2 LEFT JOIN t3 ON <x> ) ON <y>,
     the <x> join condition (posted on t3) should be above one join but
     below the other.

  TODO: The optimizer should distinguish between before-join and
  after-join conditions to begin with, instead of us having to untangle
  it here.
*/

void SplitConditions(Item *condition, vector<Item *> *predicates_below_join,
                     vector<PendingCondition> *predicates_above_join) {
  vector<Item *> condition_parts;
  ExtractConditions(condition, &condition_parts);
  for (Item *item : condition_parts) {
    Item_func_trig_cond *trig_cond = GetTriggerCondOrNull(item);
    if (trig_cond != nullptr) {
      Item *inner_cond = trig_cond->arguments()[0];
      if (trig_cond->get_trig_type() == Item_func_trig_cond::FOUND_MATCH) {
        // A WHERE predicate on the table that needs to be pushed up above the
        // join (case #3 above). Push it up to above the last outer join.
        predicates_above_join->push_back(PendingCondition{inner_cond, -1});
      } else if (trig_cond->get_trig_type() ==
                 Item_func_trig_cond::IS_NOT_NULL_COMPL) {
        // It's a join condition, so it should nominally go directly onto the
        // table. If it _also_ has a FOUND_MATCH predicate, we are dealing
        // with case #4 above, and need to push it up to exactly the right
        // spot.
        //
        // There is a special exception here for anti-joins; see the code under
        // qep_tab->table()->reginfo.not_exists_optimize in ConnectJoins().
        Item_func_trig_cond *inner_trig_cond = GetTriggerCondOrNull(inner_cond);
        if (inner_trig_cond != nullptr) {
          Item *inner_inner_cond = inner_trig_cond->arguments()[0];
          predicates_above_join->push_back(
              PendingCondition{inner_inner_cond, inner_trig_cond->idx()});
        } else {
          predicates_below_join->push_back(inner_cond);
        }
      } else {
        predicates_below_join->push_back(item);
      }
    } else {
      predicates_below_join->push_back(item);
    }
  }
}

/**
  For a given duplicate weedout operation, figure out which tables are supposed
  to be deduplicated by it, and add those to unhandled_duplicates. (SJ_TMP_TABLE
  contains the deduplication key, which is exactly the complement of the tables
  to be deduplicated.)
*/
static void MarkUnhandledDuplicates(QEP_TAB *qep_tabs, SJ_TMP_TABLE *weedout,
                                    plan_idx weedout_start,
                                    plan_idx weedout_end,
                                    vector<QEP_TAB *> *unhandled_duplicates) {
  if (weedout->is_confluent) {
    // Confluent weedout doesn't have tabs or tabs_end set; it just implicitly
    // says none of the tables are allowed to produce duplicates.
    for (plan_idx i = weedout_start; i < weedout_end; ++i) {
      unhandled_duplicates->push_back(&qep_tabs[i]);
    }
  } else {
    bool part_of_key[MAX_TABLES] = {false};
    for (SJ_TMP_TABLE::TAB *tab = weedout->tabs; tab != weedout->tabs_end;
         ++tab) {
      plan_idx i = tab->qep_tab - qep_tabs;
      DBUG_ASSERT(i >= weedout_start);
      DBUG_ASSERT(i < weedout_end);
      DBUG_ASSERT(i < plan_idx{MAX_TABLES});
      part_of_key[i] = true;
    }
    for (plan_idx i = weedout_start; i < weedout_end; ++i) {
      if (!part_of_key[i]) {
        unhandled_duplicates->push_back(&qep_tabs[i]);
      }
    }
  }
}

static unique_ptr_destroy_only<RowIterator> CreateWeedoutIterator(
    THD *thd, unique_ptr_destroy_only<RowIterator> iterator,
    SJ_TMP_TABLE *weedout_table) {
  if (weedout_table->is_confluent) {
    // A “confluent” weedout is one that deduplicates on all the
    // fields. If so, we can drop the complexity of the WeedoutIterator
    // and simply insert a LIMIT 1.
    return NewIterator<LimitOffsetIterator>(
        thd, move(iterator), /*limit=*/1, /*offset=*/0,
        /*count_all_rows=*/false, /*skipped_rows=*/nullptr);
  } else {
    return NewIterator<WeedoutIterator>(thd, move(iterator), weedout_table);
  }
}

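/*
  Rationale sketch: a confluent weedout stores no rowids at all, so every
  row combination maps to the same (empty) deduplication key; at most one
  row can survive, which is exactly what the LIMIT 1 above implements
  without the cost of a temporary table.
*/
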
static unique_ptr_destroy_only<RowIterator> CreateWeedoutIteratorForTables(
    THD *thd, const vector<QEP_TAB *> &tables_to_deduplicate, QEP_TAB *qep_tabs,
    uint primary_tables, unique_ptr_destroy_only<RowIterator> iterator) {
  bool need_dup_removal[MAX_TABLES] = {false};
  for (QEP_TAB *qep_tab : tables_to_deduplicate) {
    plan_idx i = qep_tab - qep_tabs;
    DBUG_ASSERT(i >= 0);
    DBUG_ASSERT(static_cast<uint>(i) < primary_tables);
    need_dup_removal[i] = true;
  }

  Prealloced_array<SJ_TMP_TABLE::TAB, MAX_TABLES> sj_tabs(PSI_NOT_INSTRUMENTED);
  for (uint i = 0; i < primary_tables; ++i) {
    if (!need_dup_removal[i]) {
      SJ_TMP_TABLE::TAB sj_tab;
      sj_tab.qep_tab = &qep_tabs[i];
      sj_tabs.push_back(sj_tab);

      // See JOIN::add_sorting_to_table() for rationale.
      Filesort *filesort = qep_tabs[i].filesort;
      if (filesort != nullptr) {
        DBUG_ASSERT(filesort->m_sort_param.m_addon_fields_status ==
                    Addon_fields_status::unknown_status);
        filesort->m_force_sort_positions = true;
      }
    }
  }

  JOIN *join = tables_to_deduplicate[0]->join();
  SJ_TMP_TABLE *sjtbl =
      create_sj_tmp_table(thd, join, &sj_tabs[0], &sj_tabs[0] + sj_tabs.size());
  return CreateWeedoutIterator(thd, move(iterator), sjtbl);
}

/**
  Find out whether there is a first-match jump from "last_idx" to before
  "first_idx".

  This is made a bit trickier by the existence of “split jumps”. A split jump is
  set up when there is a semijoin against two or more inner tables, but the join
  optimizer has decided to put more tables in-between. Consider e.g. the query

    (A sj (B ij C)) ij D

  where the join optimizer has decided that the order should be A, B, D, C!
  In this case, there's no direct jump from C to A, but a jump first from C to
  D, and then when D has been processed, a jump back from B to A. As with other
  non-hierarchical semijoin setups, we don't try to execute these directly,
  but rather set them up as weedouts, by adding elements to
  "unhandled_duplicates".

  @return true if there is a first match jump, split or not, from "last_idx" to
  just before "first_idx". If it's a split jump, "is_split" is set to true.
  (If the function returns false, "is_split" is undefined.)
*/
static bool FirstMatchBetween(QEP_TAB *qep_tabs, const plan_idx first_idx,
                              const plan_idx last_idx, bool *is_split,
                              vector<QEP_TAB *> *unhandled_duplicates) {
  if (qep_tabs[last_idx].match_tab != last_idx) {
    // last_idx doesn't contain a first match, or its first match is
    // the middle part of a split jump. Ignore.
    return false;
  }
  if (qep_tabs[last_idx].firstmatch_return == first_idx - 1) {
    for (plan_idx i = 0; i < first_idx; ++i) {
      if (qep_tabs[i].firstmatch_return != NO_PLAN_IDX &&
          qep_tabs[i].match_tab == last_idx) {
        // The jump from <last_idx> back to <first_idx> is the last part of a
        // larger, split jump. Ignore.
        return false;
      }
    }

    // Regular, non-split first match jump.
    *is_split = false;
    return true;
  }

  // See if there's a first match jump chain that starts at <last_idx>
  // and ends up jumping to before <first_idx>.
  for (plan_idx i = first_idx; i < last_idx; ++i) {
    if (qep_tabs[i].firstmatch_return == first_idx - 1 &&
        qep_tabs[i].match_tab == last_idx) {
      // OK, there is. We will be solving it by adding these tables to
      // unhandled_duplicates, so they will be solved with weedout at
      // the very top. Go through all the different segments of the split jump
      // to figure out which tables are supposed to be part of this semijoin.
      plan_idx this_segment_end = MAX_TABLES + 1;
      for (plan_idx j = last_idx; j >= first_idx; --j) {
        if (qep_tabs[j].match_tab == last_idx) {
          this_segment_end = qep_tabs[j].firstmatch_return;
        }
        if (j > this_segment_end) {
          unhandled_duplicates->push_back(&qep_tabs[j]);
        }
      }

      *is_split = true;
      return true;
    }
  }

  return false;
}

enum class Substructure { NONE, OUTER_JOIN, SEMIJOIN, WEEDOUT };

/**
  Given a range of tables (where we assume that we've already handled
  first_idx..(this_idx-1) as inner joins), figure out whether this is a
  semijoin, an outer join or a weedout. In general, the outermost structure
  wins; if we are in one of the rare cases where there are e.g. coincident
  outer- and semijoins, we do various forms of conflict resolution:

    - Unhandled weedouts will add elements to unhandled_duplicates
      (to be handled at the top level of the query).
    - Unhandled semijoins will either:
      * Set add_limit_1 to true, which means a LIMIT 1 iterator should
        be added, or
      * Add elements to unhandled_duplicates in situations that cannot
        be solved by a simple one-table, one-row LIMIT.

  If not returning NONE, substructure_end will also be filled with where this
  sub-join ends (exclusive).
*/
static Substructure FindSubstructure(
    QEP_TAB *qep_tabs, const plan_idx first_idx, const plan_idx this_idx,
    const plan_idx last_idx, CallingContext calling_context, bool *add_limit_1,
    plan_idx *substructure_end, vector<QEP_TAB *> *unhandled_duplicates) {
  QEP_TAB *qep_tab = &qep_tabs[this_idx];
  bool is_outer_join =
      qep_tab->last_inner() != NO_PLAN_IDX && qep_tab->last_inner() < last_idx;
  plan_idx outer_join_end =
      qep_tab->last_inner() + 1;  // Only valid if is_outer_join.

  // See if this table marks the end of the left side of a semijoin.
  bool is_semijoin = false;
  plan_idx semijoin_end = NO_PLAN_IDX;
  for (plan_idx j = this_idx; j < last_idx; ++j) {
    bool is_split;
    if (FirstMatchBetween(qep_tabs, this_idx, j, &is_split,
                          unhandled_duplicates)) {
      if (is_split) {
        // Split first-match jumps are fully handled by adding to
        // <unhandled_duplicates>, so don't communicate the semijoin upwards;
        // keep looking for other substructures instead (including smaller
        // semijoins).
      } else {
        is_semijoin = true;
        semijoin_end = j + 1;
        break;
      }
    }
  }

  // Outer joins (or semijoins) wrapping a weedout are tricky,
  // especially in edge cases. If we have an outer join wrapping
  // a weedout, the outer join needs to be processed first.
  // But the weedout wins if it's strictly larger than the outer join.
  // However, a problem occurs if the weedout wraps two consecutive
  // outer joins (which can happen if the join optimizer interleaves
  // tables from different weedouts and needs to combine them into
  // one larger weedout). E.g., consider a join order such as
  //
  //   a LEFT JOIN (b,c) LEFT JOIN (d,e)
  //
  // where there is _also_ a weedout wrapping all four tables [b,e].
  // (Presumably, there were originally two weedouts b+e and c+d,
  // but due to reordering, they were combined into one.)
  // In this case, we have a non-hierarchical situation since the
  // (a,(b,c)) join only partially overlaps with the [b,e] weedout.
  //
  // We solve these non-hierarchical cases by punting them upwards;
  // we signal that they are simply not done by adding them to
  // unhandled_duplicates, and then drop the weedout. The top level
  // will then add a final weedout after all joins. In some cases,
  // it is possible to push the weedout further down than this,
  // but these cases are so marginal that it's not worth it.

  // See if this table starts a weedout operation.
  bool is_weedout = false;
  plan_idx weedout_end = NO_PLAN_IDX;
  if (qep_tab->starts_weedout() &&
      !(calling_context == DIRECTLY_UNDER_WEEDOUT && this_idx == first_idx)) {
    for (plan_idx j = this_idx; j < last_idx; ++j) {
      if (qep_tabs[j].check_weed_out_table == qep_tab->flush_weedout_table) {
        weedout_end = j + 1;
        break;
      }
    }
    if (weedout_end != NO_PLAN_IDX) {
      is_weedout = true;
    }
  }

  if (weedout_end > last_idx) {
    // See comment above.
    MarkUnhandledDuplicates(qep_tabs, qep_tab->flush_weedout_table, this_idx,
                            weedout_end, unhandled_duplicates);
    is_weedout = false;
  }

  if (is_outer_join && is_weedout) {
    if (outer_join_end > weedout_end) {
      // Weedout will be handled at a lower recursion level.
      is_weedout = false;
    } else {
      // See comment above.
      MarkUnhandledDuplicates(qep_tabs, qep_tab->flush_weedout_table, this_idx,
                              weedout_end, unhandled_duplicates);
      is_weedout = false;
    }
  }
  if (is_semijoin && is_weedout) {
    if (semijoin_end > weedout_end) {
      // Weedout will be handled at a lower recursion level.
      is_weedout = false;
    } else {
      // See comment above.
      MarkUnhandledDuplicates(qep_tabs, qep_tab->flush_weedout_table, this_idx,
                              weedout_end, unhandled_duplicates);
      is_weedout = false;
    }
  }

  // Occasionally, a subslice may be designated as the right side of both a
  // semijoin _and_ an outer join. This is a fairly odd construction,
  // as it means exactly one row is generated no matter what (negating the
  // point of a semijoin in the first place), and typically happens as the
  // result of the join optimizer reordering tables that have no real bearing
  // on the query, such as
  // ... WHERE t1 IN ( SELECT t2.i FROM t2 LEFT JOIN t3 )
  // with the ordering t2, t1, t3 (t3 will now be in such a situation).
  //
  // Nominally, these tables should be optimized away, but this is not the
  // right place for that, so we solve it by adding a LIMIT 1 and then
  // treating the slice as a normal outer join.
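  //
  // (To make the example above concrete: once t1 has found its match, the
  // LEFT JOIN guarantees a -- possibly NULL-complemented -- t3 row, so the
  // inner side produces exactly one row per probe; cutting it off with
  // LIMIT 1 changes nothing semantically.)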
  *add_limit_1 = false;
  if (is_semijoin && is_outer_join) {
    if (semijoin_end == outer_join_end) {
      *add_limit_1 = true;
      is_semijoin = false;
    } else if (semijoin_end > outer_join_end) {
      // A special case of the special case; there might be more than one
      // outer join contained in this semijoin, e.g. A LEFT JOIN B LEFT JOIN C
      // where the combination B-C is _also_ the right side of a semijoin.
      // This forms a non-hierarchical structure and should be exceedingly
      // rare, so we handle it the same way we handle non-hierarchical weedout
      // above, ie., just by removing the added duplicates at the top of the
      // query.
      for (plan_idx i = this_idx; i < semijoin_end; ++i) {
        unhandled_duplicates->push_back(&qep_tabs[i]);
      }
      is_semijoin = false;
    }
  }

  // Yet another special case like the above; this is when we have a semijoin
  // and then a partially overlapping outer join that ends outside the
  // semijoin. E.g., A JOIN B JOIN C LEFT JOIN D, where A..C denotes a
  // semijoin (C has first match back to A).
  if (is_semijoin) {
    for (plan_idx i = this_idx; i < semijoin_end; ++i) {
      if (qep_tabs[i].last_inner() >= semijoin_end) {
        // Handle this semijoin like the non-hierarchical weedout above.
        for (plan_idx j = this_idx; j < semijoin_end; ++j) {
          unhandled_duplicates->push_back(&qep_tabs[j]);
        }
        is_semijoin = false;
        break;
      }
    }
  }

  // We may have detected both a semijoin and an outer join starting at
  // this table. Decide which one is the outermost that is not already
  // processed, so that we recurse in the right order.
  if (calling_context == DIRECTLY_UNDER_SEMIJOIN && this_idx == first_idx &&
      semijoin_end == last_idx) {
    is_semijoin = false;
  } else if (calling_context == DIRECTLY_UNDER_OUTER_JOIN &&
             this_idx == first_idx && outer_join_end == last_idx) {
    is_outer_join = false;
  }
  if (is_semijoin && is_outer_join) {
    DBUG_ASSERT(outer_join_end > semijoin_end);
    is_semijoin = false;
  }

  DBUG_ASSERT(is_semijoin + is_outer_join + is_weedout <= 1);

  if (is_semijoin) {
    *substructure_end = semijoin_end;
    return Substructure::SEMIJOIN;
  } else if (is_outer_join) {
    *substructure_end = outer_join_end;
    return Substructure::OUTER_JOIN;
  } else if (is_weedout) {
    *substructure_end = weedout_end;
    return Substructure::WEEDOUT;
  } else {
    *substructure_end = NO_PLAN_IDX;  // Not used.
    return Substructure::NONE;
  }
}

/// @cond Doxygen_is_confused
static unique_ptr_destroy_only<RowIterator> ConnectJoins(
    plan_idx first_idx, plan_idx last_idx, QEP_TAB *qep_tabs, THD *thd,
    CallingContext calling_context,
    vector<PendingCondition> *pending_conditions,
    vector<PendingInvalidator> *pending_invalidators,
    vector<QEP_TAB *> *unhandled_duplicates);
/// @endcond

/**
  Get the RowIterator used for scanning the given table, with any required
  materialization operations done first.
*/
unique_ptr_destroy_only<RowIterator> GetTableIterator(
    THD *thd, QEP_TAB *qep_tab, QEP_TAB *qep_tabs,
    vector<PendingCondition> *pending_conditions,
    vector<PendingInvalidator> *pending_invalidators,
    vector<QEP_TAB *> *unhandled_duplicates) {
  unique_ptr_destroy_only<RowIterator> table_iterator;
  if (qep_tab->materialize_table == join_materialize_derived) {
    SELECT_LEX_UNIT *unit = qep_tab->table_ref->derived_unit();
    JOIN *subjoin = nullptr;
    Temp_table_param *tmp_table_param;
    int select_number;

    // If we have a single query block at the end of the QEP_TAB array,
    // it may contain aggregation that has already set up fields and items
    // to copy, and we need to pass those to MaterializeIterator, so reuse its
    // tmp_table_param. If not, make a new object, so that we don't
    // disturb the materialization going on inside our own query block.
    if (unit->is_simple()) {
      subjoin = unit->first_select()->join;
      tmp_table_param = &unit->first_select()->join->tmp_table_param;
      select_number = subjoin->select_lex->select_number;
    } else if (unit->fake_select_lex != nullptr) {
      // NOTE: subjoin here is never used, as ConvertItemsToCopy only uses it
      // for ROLLUP, and fake_select_lex can't have ROLLUP.
      subjoin = unit->fake_select_lex->join;
      tmp_table_param = &unit->fake_select_lex->join->tmp_table_param;
      select_number = unit->fake_select_lex->select_number;
    } else {
      tmp_table_param = new (thd->mem_root) Temp_table_param;
      select_number = unit->first_select()->select_number;
    }
    ConvertItemsToCopy(unit->get_field_list(),
                       qep_tab->table()->visible_field_ptr(), tmp_table_param,
                       subjoin);
    bool copy_fields_and_items_in_materialize = true;
    if (unit->is_simple()) {
      // See if AggregateIterator already does this for us.
      JOIN *join = unit->first_select()->join;
      copy_fields_and_items_in_materialize =
          !join->streaming_aggregation ||
          join->tmp_table_param.precomputed_group_by;
    }
    if (unit->unfinished_materialization()) {
      // The unit is a UNION capable of materializing directly into our result
      // table. This saves us from doing double materialization (first into
      // a UNION result table, then from there into our own).
      //
      // We will already have set up a unique index on the table if
      // required; see TABLE_LIST::setup_materialized_derived_tmp_table().
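      //
      // (E.g., for a hypothetical derived table
      //   SELECT * FROM (SELECT a FROM t1 UNION SELECT a FROM t2) AS d,
      // both query blocks can write straight into d's table instead of
      // going through a separate UNION result table first.)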
      table_iterator = NewIterator<MaterializeIterator>(
          thd, unit->release_query_blocks_to_materialize(), qep_tab->table(),
          move(qep_tab->iterator), qep_tab->table_ref->common_table_expr(),
          unit, /*subjoin=*/nullptr,
          /*ref_slice=*/-1, qep_tab->rematerialize,
          tmp_table_param->end_write_records);
    } else if (qep_tab->table_ref->common_table_expr() == nullptr &&
               qep_tab->rematerialize && qep_tab->using_table_scan()) {
      // We don't actually need the materialization for anything (we would
      // just be reading the rows straight out from the table, never to be
      // used again), so we can just stream records directly over to the next
      // iterator. This saves both CPU time and memory (for the temporary
      // table).
      //
      // NOTE: Currently, qep_tab->rematerialize is true only for JSON_TABLE.
      // We could extend this to other situations, such as the leftmost
      // table of the join (assuming nested loop only). The test for CTEs is
      // also conservative; if the CTE is defined within this join and used
      // only once, we could still stream without losing performance.
      table_iterator = NewIterator<StreamingIterator>(
          thd, unit->release_root_iterator(), &subjoin->tmp_table_param,
          qep_tab->table(), copy_fields_and_items_in_materialize);
    } else {
      table_iterator = NewIterator<MaterializeIterator>(
          thd, unit->release_root_iterator(), tmp_table_param, qep_tab->table(),
          move(qep_tab->iterator), qep_tab->table_ref->common_table_expr(),
          select_number, unit, /*subjoin=*/nullptr,
          /*ref_slice=*/-1, copy_fields_and_items_in_materialize,
          qep_tab->rematerialize, tmp_table_param->end_write_records);
    }

    if (!qep_tab->rematerialize) {
      MaterializeIterator *materialize =
          down_cast<MaterializeIterator *>(table_iterator->real_iterator());
      if (qep_tab->invalidators != nullptr) {
        for (const CacheInvalidatorIterator *iterator :
             *qep_tab->invalidators) {
          materialize->AddInvalidator(iterator);
        }
      }
    }
  } else if (qep_tab->materialize_table == join_materialize_table_function) {
    table_iterator = NewIterator<MaterializedTableFunctionIterator>(
        thd, qep_tab->table_ref->table_function, qep_tab->table(),
        move(qep_tab->iterator));
  } else if (qep_tab->materialize_table == join_materialize_semijoin) {
    Semijoin_mat_exec *sjm = qep_tab->sj_mat_exec();

    // create_tmp_table() has already filled sjm->table_param.items_to_copy.
    // However, the structures there are not used by
    // join_materialize_semijoin, and don't have e.g. result fields set up
    // correctly, so we just clear it and create our own.
    sjm->table_param.items_to_copy = nullptr;
    ConvertItemsToCopy(&sjm->sj_nest->nested_join->sj_inner_exprs,
                       qep_tab->table()->visible_field_ptr(), &sjm->table_param,
                       qep_tab->join());

    int join_start = sjm->inner_table_index;
    int join_end = join_start + sjm->table_count;
    unique_ptr_destroy_only<RowIterator> subtree_iterator = ConnectJoins(
        join_start, join_end, qep_tabs, thd, TOP_LEVEL, pending_conditions,
        pending_invalidators, unhandled_duplicates);

    // Since materialized semijoins are based on ref access against the table,
    // and ref access has NULL = NULL (while IN expressions should not),
    // remove rows with NULLs in them here. This is only an optimization for IN
    // (since equality propagation will filter away NULLs on the other side),
    // but is required for NOT IN correctness.
    //
    // TODO: It could be possible to join this with an existing condition,
    // and possibly also in some cases when scanning each table.
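    //
    // (Illustrative: if sj_inner_exprs contains t2.a and some row has
    // t2.a = NULL, a later ref lookup with a NULL key would find that row,
    // even though "x IN (SELECT t2.a ...)" must never match on NULL; hence
    // the IS NOT NULL filters built below.)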
    vector<Item *> not_null_conditions;
    for (Item &item : sjm->sj_nest->nested_join->sj_inner_exprs) {
      if (item.maybe_null) {
        Item *condition = new Item_func_isnotnull(&item);
        condition->quick_fix_field();
        condition->update_used_tables();
        condition->apply_is_true();
        not_null_conditions.push_back(condition);
      }
    }
    subtree_iterator = PossiblyAttachFilterIterator(move(subtree_iterator),
                                                    not_null_conditions, thd);

    bool copy_fields_and_items_in_materialize =
        true;  // We never have aggregation within semijoins.
    table_iterator = NewIterator<MaterializeIterator>(
        thd, move(subtree_iterator), &sjm->table_param, qep_tab->table(),
        move(qep_tab->iterator), /*cte=*/nullptr,
        qep_tab->join()->select_lex->select_number, qep_tab->join()->unit,
        qep_tab->join(),
        /*ref_slice=*/-1, copy_fields_and_items_in_materialize,
        qep_tab->rematerialize, sjm->table_param.end_write_records);

#ifndef DBUG_OFF
    // Make sure we clear this table out when the join is reset,
    // since its contents may depend on outer expressions.
    bool found = false;
    for (TABLE &sj_tmp_tab : qep_tab->join()->sj_tmp_tables) {
      if (&sj_tmp_tab == qep_tab->table()) {
        found = true;
        break;
      }
    }
    DBUG_ASSERT(found);
#endif
  } else {
    table_iterator = move(qep_tab->iterator);

    POSITION *pos = qep_tab->position();
    if (pos != nullptr) {
      SetCostOnTableIterator(*thd->cost_model(), pos, /*is_after_filter=*/false,
                             table_iterator.get());
    }

    // See if this is an information schema table that must be filled in before
    // we scan.
    if (qep_tab->table_ref->schema_table &&
        qep_tab->table_ref->schema_table->fill_table) {
      table_iterator.reset(new (thd->mem_root)
                               MaterializeInformationSchemaTableIterator(
                                   thd, qep_tab, move(table_iterator)));
    }
  }
  return table_iterator;
}

void SetCostOnTableIterator(const Cost_model_server &cost_model,
                            const POSITION *pos, bool is_after_filter,
                            RowIterator *iterator) {
  double num_rows_after_filtering = pos->rows_fetched * pos->filter_effect;
  if (is_after_filter) {
    iterator->set_expected_rows(num_rows_after_filtering);
  } else {
    iterator->set_expected_rows(pos->rows_fetched);
  }

  // Note that we don't try to adjust for the filtering here;
  // we estimate the same cost as the table itself.
  double cost =
      pos->read_cost + cost_model.row_evaluate_cost(num_rows_after_filtering);
  if (pos->prefix_rowcount <= 0.0) {
    iterator->set_estimated_cost(cost);
  } else {
    // Scale the estimated cost to being for one loop only, to match the
    // measured costs.
    iterator->set_estimated_cost(cost * num_rows_after_filtering /
                                 pos->prefix_rowcount);
  }
}
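
// (A worked example of the scaling above, with made-up numbers: if
// rows_fetched = 100, filter_effect = 0.1 and read_cost = 20, then
// num_rows_after_filtering = 10 and the unscaled cost is
// 20 + row_evaluate_cost(10). With prefix_rowcount = 50, the cost reported
// for a single loop becomes that value multiplied by 10/50, i.e. one fifth.)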

void SetCostOnNestedLoopIterator(const Cost_model_server &cost_model,
                                 const POSITION *pos_right,
                                 RowIterator *iterator) {
  if (pos_right == nullptr) {
    // No cost information.
    return;
  }

  DBUG_ASSERT(iterator->children().size() == 2);
  RowIterator *left = iterator->children()[0].iterator;
  RowIterator *right = iterator->children()[1].iterator;

  if (left->expected_rows() == -1.0 || right->expected_rows() == -1.0) {
    // Missing cost information on at least one child.
    return;
  }

  // Mirrors set_prefix_join_cost(), even though the cost calculation doesn't
  // make a lot of sense.
  double right_expected_rows_before_filter =
      pos_right->filter_effect > 0.0
          ? (right->expected_rows() / pos_right->filter_effect)
          : 0.0;
  double joined_rows =
      left->expected_rows() * right_expected_rows_before_filter;
  iterator->set_expected_rows(joined_rows * pos_right->filter_effect);
  iterator->set_estimated_cost(left->estimated_cost() + pos_right->read_cost +
                               cost_model.row_evaluate_cost(joined_rows));
}

void SetCostOnHashJoinIterator(const Cost_model_server &cost_model,
                               const POSITION *pos_right,
                               RowIterator *iterator) {
  if (pos_right == nullptr) {
    // No cost information.
    return;
  }

  DBUG_ASSERT(iterator->children().size() == 2);
  RowIterator *left = iterator->children()[1].iterator;
  RowIterator *right = iterator->children()[0].iterator;

  if (left->expected_rows() == -1.0 || right->expected_rows() == -1.0) {
    // Missing cost information on at least one child.
    return;
  }

  // Mirrors set_prefix_join_cost(), even though the cost calculation doesn't
  // make a lot of sense.
  double joined_rows = left->expected_rows() * right->expected_rows();
  iterator->set_expected_rows(joined_rows * pos_right->filter_effect);
  iterator->set_estimated_cost(left->estimated_cost() + pos_right->read_cost +
                               cost_model.row_evaluate_cost(joined_rows));
}

// Move all the hash join conditions from the vector "predicates" over to the
// vector "hash_join_conditions". Only join conditions that are suitable for
// hash join are moved. If there are any conditions that have to be evaluated
// after the join (i.e. non equi-join conditions), they are placed in the
// vector "conditions_after_hash_join" so that they can be attached as filters
// after the join.
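//
// (For example, with hypothetical tables: given
//   SELECT * FROM t1 JOIN t2 ON t1.a = t2.a AND t1.b < t2.b
// the equi-join condition t1.a = t2.a ends up in "hash_join_conditions",
// while t1.b < t2.b goes to "conditions_after_hash_join" and is applied as
// a filter on top of the hash join.)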
static void ExtractHashJoinConditions(
    const QEP_TAB *current_table, const std::vector<QEP_TAB *> &left_tables,
    vector<Item *> *predicates, vector<Item_func_eq *> *hash_join_conditions,
    vector<Item *> *conditions_after_hash_join) {
  table_map left_tables_map = 0;
  for (QEP_TAB *qep_tab : left_tables) {
    left_tables_map = left_tables_map | qep_tab->table_ref->map();
  }

  for (Item *item : *predicates) {
    if (item->type() != Item::FUNC_ITEM) {
      continue;
    }

    Item_func *func_item = down_cast<Item_func *>(item);
    if (func_item->functype() != Item_func::EQ_FUNC) {
      continue;
    }

    Item_func_eq *item_func_eq = down_cast<Item_func_eq *>(func_item);
    if (item_func_eq->has_any_hash_join_condition(left_tables_map,
                                                  *current_table)) {
      hash_join_conditions->emplace_back(item_func_eq);
    }
  }

  // Remove all hash join conditions from the vector "predicates".
  predicates->erase(remove_if(predicates->begin(), predicates->end(),
                              [&hash_join_conditions](const Item *item) {
                                return find(hash_join_conditions->begin(),
                                            hash_join_conditions->end(),
                                            item) !=
                                       hash_join_conditions->end();
                              }),
                    predicates->end());

  // See if any of the remaining conditions should be attached as a filter
  // after the join. If so, place them in a separate vector.
  for (int i = predicates->size() - 1; i >= 0; --i) {
    Item *item = predicates->at(i);
    table_map used_tables = item->used_tables();
    if ((~current_table->table_ref->map() & used_tables) > 0) {
      conditions_after_hash_join->emplace_back(item);
      predicates->erase(predicates->begin() + i);
    }
  }
}

/**
  For a given slice of the table list, build up the iterator tree corresponding
  to the tables in that slice. It handles inner and outer joins, as well as
  semijoins (“first match”).

  The join tree in MySQL is generally a left-deep tree of inner joins,
  so we can start at the left, make an inner join against the next table,
  join the result of that against the next table, etc. However, a given
  sub-slice of the table list can be designated as an outer join, by setting
  first_inner() and last_inner() on the first table of said slice. (It is also
  set in some, but not all, of the other tables in the slice.) If so, we call
  ourselves recursively with that slice, put it as the right (inner) arm of
  an outer join, and then continue with our inner join.

  Similarly, if a table N has set “first match” to table M (ie., jump back to
  table M whenever we see a non-filtered record in table N), then there is a
  subslice from [M+1,N] that we need to process recursively before putting it
  as the right side of a semijoin. Every semijoin can be implemented with a
  LIMIT 1, but for clarity and performance, we prefer to use a NestedLoopJoin
  with a special SEMI join type whenever possible. Sometimes, we have no
  choice, though (see the comments below). Note that we cannot use
  first_sj_inner() for detecting semijoins, as it is not updated when tables
  are reordered by the join optimizer. Outer joins and semijoins can nest, so
  we need to take some care to make sure that we pick the outermost structure
  to recurse on.

  Conditions are a bit tricky. Conceptually, SQL evaluates conditions only
  after all tables have been joined; however, for efficiency reasons, we want
  to evaluate them as early as possible. As long as we are only dealing with
  inner joins, this is as soon as we've read all tables participating in the
  condition, but for outer joins, we need to wait until the join has happened.
  See pending_conditions below.
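
  (For example, with a hypothetical query: in
  SELECT * FROM a LEFT JOIN b ON a.x = b.x WHERE b.y IS NULL, the WHERE
  predicate cannot be checked while reading b; whether b.y is NULL is only
  known after the outer join has, or has not, NULL-complemented the row, so
  the predicate is deferred via pending_conditions and applied above the
  join.)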

  @param first_idx index of the first table in the slice we are creating a
    tree for (inclusive)
  @param last_idx index of the last table in the slice we are creating a
    tree for (exclusive)
  @param qep_tabs the full list of tables we are joining
  @param thd the THD to allocate the iterators on
  @param calling_context what situation we have immediately around us in the
    tree (ie., whether we are called to resolve the inner part of an outer
    join, a semijoin, etc.); mostly used to avoid infinite recursion where we
    would process e.g. the same semijoin over and over again
  @param pending_conditions if nullptr, we are not at the right (inner) side of
    any outer join and can evaluate conditions immediately. If not, we need to
    push any WHERE predicates to that vector and evaluate them only after joins.
  @param pending_invalidators similar to pending_conditions, but for tables
    that should have a CacheInvalidatorIterator synthesized for them;
    NULL-complemented rows must also invalidate materialized lateral derived
    tables.
  @param[out] unhandled_duplicates list of tables we should have deduplicated
    using duplicate weedout, but could not; append-only.
*/
static unique_ptr_destroy_only<RowIterator> ConnectJoins(
    plan_idx first_idx, plan_idx last_idx, QEP_TAB *qep_tabs, THD *thd,
    CallingContext calling_context,
    vector<PendingCondition> *pending_conditions,
    vector<PendingInvalidator> *pending_invalidators,
    vector<QEP_TAB *> *unhandled_duplicates) {
  DBUG_ASSERT(last_idx > first_idx);
  DBUG_ASSERT((pending_conditions == nullptr) ==
              (pending_invalidators == nullptr));
  unique_ptr_destroy_only<RowIterator> iterator = nullptr;

  // A special case: If we are at the top but the first table is an outer
  // join, we implicitly have one or more const tables to the left side
  // of said join.
  bool is_top_level_outer_join =
      calling_context == TOP_LEVEL &&
      qep_tabs[first_idx].last_inner() != NO_PLAN_IDX;

  vector<PendingCondition> top_level_pending_conditions;
  vector<PendingInvalidator> top_level_pending_invalidators;
  if (is_top_level_outer_join) {
    iterator =
        NewIterator<FakeSingleRowIterator>(thd, /*examined_rows=*/nullptr);
    pending_conditions = &top_level_pending_conditions;
    pending_invalidators = &top_level_pending_invalidators;
  }

  // NOTE: i is advanced in one of two ways:
  //
  // - If we have an inner join, it will be incremented near the bottom of the
  //   loop, as we can process inner join tables one by one.
  // - If not (ie., we have an outer join or semijoin), we will process
  //   the sub-join recursively, and thus move it past the end of said
  //   sub-join.
  for (plan_idx i = first_idx; i < last_idx;) {
    bool add_limit_1;
    plan_idx substructure_end;
    Substructure substructure =
        FindSubstructure(qep_tabs, first_idx, i, last_idx, calling_context,
                         &add_limit_1, &substructure_end, unhandled_duplicates);

    QEP_TAB *qep_tab = &qep_tabs[i];
    if (substructure == Substructure::OUTER_JOIN ||
        substructure == Substructure::SEMIJOIN) {
      // Outer or semijoin, consisting of a subtree (possibly of only one
      // table), so we send the entire subtree down to a recursive invocation
      // and then join the returned root into our existing tree.
      unique_ptr_destroy_only<RowIterator> subtree_iterator;
      vector<PendingCondition> subtree_pending_conditions;
      vector<PendingInvalidator> subtree_pending_invalidators;
      if (substructure == Substructure::SEMIJOIN) {
        // Semijoins don't have special handling of WHERE, so simply recurse.
        subtree_iterator = ConnectJoins(
            i, substructure_end, qep_tabs, thd, DIRECTLY_UNDER_SEMIJOIN,
            pending_conditions, pending_invalidators, unhandled_duplicates);
      } else if (pending_conditions != nullptr) {
        // We are already on the right (inner) side of an outer join,
        // so we need to keep deferring WHERE predicates.
        subtree_iterator = ConnectJoins(
            i, substructure_end, qep_tabs, thd, DIRECTLY_UNDER_OUTER_JOIN,
            pending_conditions, pending_invalidators, unhandled_duplicates);

        // Pick out any conditions that should be directly above this join
        // (ie., the ON conditions for this specific join).
        for (auto it = pending_conditions->begin();
             it != pending_conditions->end();) {
          if (it->table_index_to_attach_to == int(i)) {
            subtree_pending_conditions.push_back(*it);
            it = pending_conditions->erase(it);
          } else {
            ++it;
          }
        }

        // Similarly, for invalidators.
        for (auto it = pending_invalidators->begin();
             it != pending_invalidators->end();) {
          if (it->table_index_to_attach_to == int(i)) {
            subtree_pending_invalidators.push_back(*it);
            it = pending_invalidators->erase(it);
          } else {
            ++it;
          }
        }
      } else {
        // We can check the WHERE predicates on this table right away
        // after the join (and similarly, set up invalidators).
        subtree_iterator =
            ConnectJoins(i, substructure_end, qep_tabs, thd,
                         DIRECTLY_UNDER_OUTER_JOIN, &subtree_pending_conditions,
                         &subtree_pending_invalidators, unhandled_duplicates);
      }

      JoinType join_type;
      if (qep_tab->table()->reginfo.not_exists_optimize) {
        // Similar to the comment on SplitConditions (see case #3), we can only
        // enable anti-join optimizations if we are not already on the right
        // (inner) side of another outer join. Otherwise, we would cause the
        // higher-up outer join to create NULL rows where there should be none.
        DBUG_ASSERT(substructure != Substructure::SEMIJOIN);
        join_type =
            (pending_conditions == nullptr) ? JoinType::ANTI : JoinType::OUTER;

        // Normally, a “found” trigger means that the condition should be moved
        // up above some outer join (ie., it's a WHERE, not an ON condition).
        // However, there is one specific case where the optimizer sets up such
        // a trigger with the condition being _the same table as it's posted
        // on_, namely anti-joins used for NOT IN; here, a FALSE condition is
        // being used to specify that inner rows should pass by the join, but
        // they should inhibit the null-complemented row. (So in this case,
        // the anti-join is no longer just an optimization that can be ignored
        // as we rewrite into an outer join.) In this case, there's a condition
        // wrapped in “not_null_compl” and “found”, with the trigger for both
        // being the same table as the condition is posted on.
        //
        // So, as a special exception, detect this case, removing these
        // conditions (as they would otherwise kill all of our output rows) and
        // use them to mark the join as _really_ anti-join, even when it's
        // within an outer join.
        for (auto it = subtree_pending_conditions.begin();
             it != subtree_pending_conditions.end();) {
          if (it->table_index_to_attach_to == int(i) &&
              it->cond->item_name.ptr() == antijoin_null_cond) {
            DBUG_ASSERT(nullptr != dynamic_cast<Item_func_false *>(it->cond));
            join_type = JoinType::ANTI;
            it = subtree_pending_conditions.erase(it);
          } else {
            ++it;
          }
        }
      } else {
        join_type = substructure == Substructure::SEMIJOIN ? JoinType::SEMI
                                                           : JoinType::OUTER;
      }

      // If the entire slice is a semijoin (e.g. because we are semijoined
      // against all the const tables, or because we're a semijoin within an
      // outer join), solve it by using LIMIT 1.
      //
      // If the entire slice is an outer join, we've solved that in a more
      // roundabout way; see is_top_level_outer_join above.
      if (iterator == nullptr) {
        DBUG_ASSERT(substructure == Substructure::SEMIJOIN);
        add_limit_1 = true;
      }

      if (add_limit_1) {
        subtree_iterator = NewIterator<LimitOffsetIterator>(
            thd, move(subtree_iterator), /*limit=*/1, /*offset=*/0,
            /*count_all_rows=*/false, /*skipped_rows=*/nullptr);
      }

      const bool pfs_batch_mode = qep_tab->pfs_batch_update(qep_tab->join()) &&
                                  join_type != JoinType::ANTI &&
                                  join_type != JoinType::SEMI;
      bool remove_duplicates_loose_scan = false;
      if (i != first_idx && qep_tabs[i - 1].do_loosescan() &&
          qep_tabs[i - 1].match_tab != i - 1) {
        QEP_TAB *prev_qep_tab = &qep_tabs[i - 1];
        DBUG_ASSERT(iterator != nullptr);

        KEY *key = prev_qep_tab->table()->key_info + prev_qep_tab->index();
        if (substructure == Substructure::SEMIJOIN) {
          iterator =
              NewIterator<NestedLoopSemiJoinWithDuplicateRemovalIterator>(
                  thd, move(iterator), move(subtree_iterator),
                  prev_qep_tab->table(), key, prev_qep_tab->loosescan_key_len);
          SetCostOnNestedLoopIterator(*thd->cost_model(), qep_tab->position(),
                                      iterator.get());
        } else {
          // We were originally in a semijoin; even if it didn't win in
          // FindSubstructure (LooseScan against multiple tables always puts
          // the non-first tables in FirstMatch), it was just overridden by
          // the outer join. In this case, we put duplicate removal after the
          // join (and any associated filtering), which is the safe option --
          // and in this case, it's no slower, since we'll be having a LIMIT 1
          // inserted anyway.
          DBUG_ASSERT(substructure == Substructure::OUTER_JOIN);
          remove_duplicates_loose_scan = true;

          iterator = NewIterator<NestedLoopIterator>(thd, move(iterator),
                                                     move(subtree_iterator),
                                                     join_type, pfs_batch_mode);
          SetCostOnNestedLoopIterator(*thd->cost_model(), qep_tab->position(),
                                      iterator.get());
        }
      } else if (iterator == nullptr) {
        DBUG_ASSERT(substructure == Substructure::SEMIJOIN);
        iterator = move(subtree_iterator);
      } else {
        iterator = NewIterator<NestedLoopIterator>(thd, move(iterator),
                                                   move(subtree_iterator),
                                                   join_type, pfs_batch_mode);
        SetCostOnNestedLoopIterator(*thd->cost_model(), qep_tab->position(),
                                    iterator.get());
      }

      iterator = PossiblyAttachFilterIterator(move(iterator),
                                              subtree_pending_conditions, thd);

      if (remove_duplicates_loose_scan) {
        QEP_TAB *prev_qep_tab = &qep_tabs[i - 1];
        KEY *key = prev_qep_tab->table()->key_info + prev_qep_tab->index();
        iterator = NewIterator<RemoveDuplicatesIterator>(
            thd, move(iterator), prev_qep_tab->table(), key,
            prev_qep_tab->loosescan_key_len);
      }

      // It's highly unlikely that we have more than one pending QEP_TAB here
      // (the most common case will be zero), so don't bother combining them
      // into one invalidator.
      for (const PendingInvalidator &invalidator :
           subtree_pending_invalidators) {
        iterator =
            CreateInvalidatorIterator(thd, invalidator.qep_tab, move(iterator));
      }

      i = substructure_end;
      continue;
    } else if (substructure == Substructure::WEEDOUT) {
      unique_ptr_destroy_only<RowIterator> subtree_iterator = ConnectJoins(
          i, substructure_end, qep_tabs, thd, DIRECTLY_UNDER_WEEDOUT,
          pending_conditions, pending_invalidators, unhandled_duplicates);
      RowIterator *child_iterator = subtree_iterator.get();
      subtree_iterator = CreateWeedoutIterator(thd, move(subtree_iterator),
                                               qep_tab->flush_weedout_table);

      // Copy costs (even though it makes no sense for the LIMIT 1 case).
      subtree_iterator->set_expected_rows(child_iterator->expected_rows());
      subtree_iterator->set_estimated_cost(child_iterator->estimated_cost());

      if (iterator == nullptr) {
        iterator = move(subtree_iterator);
      } else {
        iterator = NewIterator<NestedLoopIterator>(
            thd, move(iterator), move(subtree_iterator), JoinType::INNER,
            /*pfs_batch_mode=*/false);
        SetCostOnNestedLoopIterator(*thd->cost_model(), qep_tab->position(),
                                    iterator.get());
      }

      i = substructure_end;
      continue;
    }

    unique_ptr_destroy_only<RowIterator> table_iterator =
        GetTableIterator(thd, qep_tab, qep_tabs, pending_conditions,
                         pending_invalidators, unhandled_duplicates);

    vector<Item *> predicates_below_join;
    vector<Item_func_eq *> hash_join_conditions;
    vector<Item *> conditions_after_hash_join;
    vector<PendingCondition> predicates_above_join;
    SplitConditions(qep_tab->condition(), &predicates_below_join,
                    &predicates_above_join);

    vector<QEP_TAB *> left_tables;

    // If this is a BNL, we should replace it with a hash join. We already
    // decided during create_iterators that the BNL can indeed be replaced,
    // so we don't bother checking again here.
    const bool replace_with_hash_join =
        qep_tab->op != nullptr &&
        qep_tab->op->type() == QEP_operation::OT_CACHE;

    if (replace_with_hash_join) {
      // Get the left tables of this join.
      for (plan_idx j = first_idx; j < i; ++j) {
        left_tables.push_back(&qep_tabs[j]);
      }

      // All join conditions are now contained in "predicates_below_join". We
      // will now take all the hash join conditions (equi-join conditions) and
      // move them to a separate vector so we can attach them to the hash join
      // iterator later. Also, "predicates_below_join" might contain conditions
      // that should be applied after the join (for instance non equi-join
      // conditions). Put them in a separate vector, and attach them as a filter
      // after the hash join.
      ExtractHashJoinConditions(qep_tab, left_tables, &predicates_below_join,
                                &hash_join_conditions,
                                &conditions_after_hash_join);
    }

    if (!qep_tab->condition_is_pushed_to_sort()) {  // See the comment on #2.
      double expected_rows = table_iterator->expected_rows();
      table_iterator = PossiblyAttachFilterIterator(move(table_iterator),
                                                    predicates_below_join, thd);
      POSITION *pos = qep_tab->position();
      if (expected_rows >= 0.0 && !predicates_below_join.empty() &&
          pos != nullptr) {
        SetCostOnTableIterator(*thd->cost_model(), pos,
                               /*is_after_filter=*/true, table_iterator.get());
      }
    }

    // Handle LooseScan that hits this specific table only.
    // Multi-table LooseScans will be handled by
    // NestedLoopSemiJoinWithDuplicateRemovalIterator
    // (which is essentially a semijoin NestedLoopIterator and
    // RemoveDuplicatesIterator in one).
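    //
    // (Illustrative: when driving "t1.a IN (SELECT t2.a FROM t2)" off a scan
    // on an index on t2.a, each distinct prefix of length loosescan_key_len
    // needs to produce at most one row, so consecutive rows with an equal
    // key are simply skipped.)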
    if (qep_tab->do_loosescan() && qep_tab->match_tab == i) {
      KEY *key = qep_tab->table()->key_info + qep_tab->index();
      table_iterator = NewIterator<RemoveDuplicatesIterator>(
          thd, move(table_iterator), qep_tab->table(), key,
          qep_tab->loosescan_key_len);
    }

    if (qep_tab->lateral_derived_tables_depend_on_me) {
      if (pending_invalidators != nullptr) {
        pending_invalidators->push_back(
            PendingInvalidator{qep_tab, /*table_index_to_attach_to=*/i});
      } else {
        table_iterator =
            CreateInvalidatorIterator(thd, qep_tab, move(table_iterator));
      }
    }

    if (iterator == nullptr) {
      // We are the first table in this join.
      iterator = move(table_iterator);
    } else {
      // We can only enable DISTINCT optimizations if we are not in the right
      // (inner) side of an outer join; since the filter is deferred, the limit
      // would have to be, too. Similarly, the old executor can do these
      // optimizations for multiple tables, but it requires poking into global
      // state to see if later tables produced rows or not; we restrict
      // ourselves to the rightmost table, instead of trying to make iterators
      // look at nonlocal state.
      //
      // We don't lose correctness by not applying the limit, only performance
      // on some fairly rare queries (for the former: DISTINCT queries where we
      // outer-join in a table that we don't use in the select list, but filter
      // on one of the columns; for the latter: queries with multiple unused
      // tables).
      //
      // Note that if we are to attach a hash join iterator, we cannot add this
      // optimization as it would limit the probe input to only one row before
      // the join condition is even applied.
      //
      // TODO: Consider pushing this limit up the tree together with the filter.
      // Note that this would require some trickery to reset the filter for
      // each new row on the left side of the join, so it's probably not worth
      // it.
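      //
      // (E.g., with hypothetical tables: in
      //   SELECT DISTINCT t1.a FROM t1 JOIN t2 ON t1.a = t2.a
      // t2 contributes nothing to the output row, so one matching t2 row per
      // t1 row is enough, and the rightmost table can be capped with LIMIT 1.)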
      if (qep_tab->not_used_in_distinct && pending_conditions == nullptr &&
          i == static_cast<plan_idx>(qep_tab->join()->primary_tables - 1) &&
          !add_limit_1 && !replace_with_hash_join) {
        table_iterator = NewIterator<LimitOffsetIterator>(
            thd, move(table_iterator), /*limit=*/1, /*offset=*/0,
            /*count_all_rows=*/false, /*skipped_rows=*/nullptr);
      }

      // Inner join this table to the existing tree.
      // Inner joins are always left-deep, so we can just attach the tables as
      // we find them.
      DBUG_ASSERT(qep_tab->last_inner() == NO_PLAN_IDX);

      if (replace_with_hash_join) {
        const bool has_grouping =
            qep_tab->join()->implicit_grouping || qep_tab->join()->grouped;

        const bool has_limit = qep_tab->join()->m_select_limit != HA_POS_ERROR;

        const bool has_order_by = qep_tab->join()->order.order != nullptr;

        // If we have a limit in the query, do not allow hash join to spill to
        // disk. The effect of this is that hash join will start producing
        // result rows a lot earlier, and thus hit the LIMIT a lot sooner.
        // Ideally, this should be decided during optimization.
        // There are however two situations where we always allow spill to
        // disk, and that is if we either have grouping or sorting in the
        // query. In those cases, the iterator above us will most likely
        // consume the entire result set anyways.
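        //
        // (E.g., for a hypothetical "SELECT ... FROM t1 JOIN t2 ... LIMIT 10",
        // keeping the join in memory lets the first rows appear early, while
        // with an ORDER BY on top the sort reads everything anyway, so
        // spilling costs nothing extra.)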
        const bool allow_spill_to_disk =
            !has_limit || has_grouping || has_order_by;

        // The numerically lower QEP_TAB is often (if not always) the smaller
        // input, so use that as the build input.
        iterator = NewIterator<HashJoinIterator>(
            thd, move(iterator), left_tables, move(table_iterator), qep_tab,
            thd->variables.join_buff_size, hash_join_conditions,
            allow_spill_to_disk);
        SetCostOnHashJoinIterator(*thd->cost_model(), qep_tab->position(),
                                  iterator.get());

        // Attach the conditions that must be evaluated after the join, such as
        // non equi-join conditions.
        iterator = PossiblyAttachFilterIterator(
            move(iterator), conditions_after_hash_join, thd);
      } else {
        iterator = CreateNestedLoopIterator(
            thd, move(iterator), move(table_iterator), JoinType::INNER,
            qep_tab->pfs_batch_update(qep_tab->join()));
        SetCostOnNestedLoopIterator(*thd->cost_model(), qep_tab->position(),
                                    iterator.get());
      }
    }
    ++i;

    // If we have any predicates that should be above an outer join,
    // send them upwards.
    for (PendingCondition &cond : predicates_above_join) {
      DBUG_ASSERT(pending_conditions != nullptr);
      pending_conditions->push_back(cond);
    }
  }
  if (is_top_level_outer_join) {
    iterator = PossiblyAttachFilterIterator(move(iterator),
                                            top_level_pending_conditions, thd);

    // We can't have any invalidators here, because there's no later table
    // to invalidate.
    DBUG_ASSERT(top_level_pending_invalidators.empty());
  }
  return iterator;
}

void JOIN::create_iterators() {
  DBUG_ASSERT(m_root_iterator == nullptr);

  // 1) Set up the basic RowIterators for accessing each specific table.
  //    This is needed even if we run in the pre-iterator executor.
  create_table_iterators();

  if (select_lex->parent_lex->m_sql_cmd != nullptr &&
      select_lex->parent_lex->m_sql_cmd->using_secondary_storage_engine()) {
    return;
  }

  // 2) If supported by the implemented iterators, we also create the
  //    composite iterators combining the rows from each table.
  unique_ptr_destroy_only<RowIterator> iterator =
      create_root_iterator_for_join();
  if (iterator == nullptr) {
    // The query is not supported by the iterator executor.
    DBUG_ASSERT(!select_lex->parent_lex->force_iterator_executor);
    return;
  }

  iterator = attach_iterators_for_having_and_limit(move(iterator));
  iterator->set_join_for_explain(this);
  m_root_iterator = move(iterator);
}

void JOIN::create_table_iterators() {
  for (unsigned table_idx = const_tables; table_idx < tables; ++table_idx) {
    QEP_TAB *qep_tab = &this->qep_tab[table_idx];
    if (qep_tab->position() == nullptr) {
      continue;
    }

    /*
      Create the specific RowIterators, including any specific
      RowIterator for the pushed queries.
    */
    qep_tab->pick_table_access_method();

    if (qep_tab->filesort) {
      unique_ptr_destroy_only<RowIterator> iterator = move(qep_tab->iterator);

      // Evaluate any conditions before sorting entire row set.
      if (qep_tab->condition()) {
        vector<Item *> predicates_below_join;
        vector<PendingCondition> predicates_above_join;
        SplitConditions(qep_tab->condition(), &predicates_below_join,
                        &predicates_above_join);

        iterator = PossiblyAttachFilterIterator(move(iterator),
                                                predicates_below_join, thd);
        qep_tab->mark_condition_as_pushed_to_sort();
      }

      // Wrap the chosen RowIterator in a SortingIterator, so that we get
      // sorted results out.
      qep_tab->iterator = NewIterator<SortingIterator>(
          qep_tab->join()->thd, qep_tab->filesort, move(iterator),
          &qep_tab->join()->examined_rows);
      qep_tab->table()->sorting_iterator =
          down_cast<SortingIterator *>(qep_tab->iterator->real_iterator());
    }
  }
}

unique_ptr_destroy_only<RowIterator> JOIN::create_root_iterator_for_join() {
  if (select_count) {
    return unique_ptr_destroy_only<RowIterator>(
        new (thd->mem_root) UnqualifiedCountIterator(thd, this));
  }

  struct MaterializeOperation {
    QEP_TAB *temporary_qep_tab;
    enum {
      MATERIALIZE,
      AGGREGATE_THEN_MATERIALIZE,
      AGGREGATE_INTO_TMP_TABLE,
      WINDOWING_FUNCTION
    } type;
  };
  vector<MaterializeOperation> final_materializations;

  // There are only two specific cases where we need to use the pre-iterator
  // executor:
  //
  // 1. We have a child query expression that needs to run in it.
  // 2. We have join buffering (BNL with non equi-join condition/BKA).
  //
  // If either #1 or #2 is detected, revert to the pre-iterator executor.
  for (unsigned table_idx = const_tables; table_idx < tables; ++table_idx) {
    QEP_TAB *qep_tab = &this->qep_tab[table_idx];
    if (qep_tab->materialize_table == join_materialize_derived) {
      // If we have a derived table that can be processed by
      // the iterator executor, MaterializeIterator can deal with it.
      SELECT_LEX_UNIT *unit = qep_tab->table_ref->derived_unit();
      if (unit->root_iterator() == nullptr &&
          !unit->unfinished_materialization()) {
        // Runs in the pre-iterator executor.
        return nullptr;
      }
    }
    if (qep_tab->next_select == sub_select_op) {
      QEP_operation *op = qep_tab[1].op;
      if (op->type() != QEP_operation::OT_TMP_TABLE) {
        // See if it's possible to replace the BNL with a hash join.
        const JOIN_CACHE *join_cache = down_cast<const JOIN_CACHE *>(op);
        if (!join_cache->can_be_replaced_with_hash_join()) {
          return nullptr;
        }
      } else {
        DBUG_ASSERT(op->type() == QEP_operation::OT_TMP_TABLE);
        QEP_tmp_table *tmp_op = down_cast<QEP_tmp_table *>(op);
        if (tmp_op->get_write_func() == end_write) {
          DBUG_ASSERT(need_tmp_before_win);
          final_materializations.push_back(MaterializeOperation{
              qep_tab + 1, MaterializeOperation::MATERIALIZE});
        } else if (tmp_op->get_write_func() == end_write_group) {
          final_materializations.push_back(MaterializeOperation{
              qep_tab + 1, MaterializeOperation::AGGREGATE_THEN_MATERIALIZE});
        } else if (tmp_op->get_write_func() == end_update) {
          final_materializations.push_back(MaterializeOperation{
              qep_tab + 1, MaterializeOperation::AGGREGATE_INTO_TMP_TABLE});
        } else if (tmp_op->get_write_func() == end_write_wf) {
          final_materializations.push_back(MaterializeOperation{
              qep_tab + 1, MaterializeOperation::WINDOWING_FUNCTION});
        }
      }
    }
  }

  // OK, so we're good. Go through the tables and make the join iterators.
  for (unsigned table_idx = const_tables; table_idx < tables; ++table_idx) {
    QEP_TAB *qep_tab = &this->qep_tab[table_idx];
    if (qep_tab->position() == nullptr) {
      continue;
    }

    // We don't use these in the iterator executor (except for figuring out
    // which conditions are join conditions and which are from WHERE),
    // so we remove them whenever we can. However, we don't prune them
    // entirely from the query tree, so they may be left within e.g.
    // sub-conditions of ORs. Open up the conditions so that we don't
    // have conditions that are disabled during execution.
    qep_tab->not_null_compl = true;
    qep_tab->found = true;
  }

unique_ptr_destroy_only<RowIterator> iterator;
|
|
if (const_tables == primary_tables) {
|
|
// Only const tables, so add a fake single row to join in all
|
|
// the const tables (only inner-joined tables are promoted to
|
|
// const tables in the optimizer).
|
|
iterator = NewIterator<FakeSingleRowIterator>(thd, &examined_rows);
|
|
if (where_cond != nullptr) {
|
|
iterator = PossiblyAttachFilterIterator(move(iterator),
|
|
vector<Item *>{where_cond}, thd);
|
|
}
|
|
|
|
// Surprisingly enough, we can specify that the const tables are
|
|
// to be dumped immediately to a temporary table. If we don't do this,
|
|
// we risk that there are fields that are not copied correctly
|
|
// (tmp_table_param contains copy_funcs we'd otherwise miss).
|
|
if (const_tables > 0) {
|
|
QEP_TAB *qep_tab = &this->qep_tab[const_tables];
|
|
if (qep_tab[-1].next_select == sub_select_op) {
|
|
// We don't support join buffering, but we do support temporary tables.
|
|
QEP_operation *op = qep_tab->op;
|
|
if (op->type() != QEP_operation::OT_TMP_TABLE) {
|
|
return nullptr;
|
|
}
|
|
DBUG_ASSERT(down_cast<QEP_tmp_table *>(op)->get_write_func() ==
|
|
end_write);
|
|
qep_tab->iterator.reset();
|
|
join_setup_iterator(qep_tab);
|
|
qep_tab->table()->alias = "<temporary>";
|
|
iterator = NewIterator<MaterializeIterator>(
|
|
thd, move(iterator), qep_tab->tmp_table_param, qep_tab->table(),
|
|
move(qep_tab->iterator), /*cte=*/nullptr, select_lex->select_number,
|
|
unit, this, qep_tab->ref_item_slice,
|
|
/*copy_fields_and_items=*/true,
|
|
/*rematerialize=*/true,
|
|
qep_tab->tmp_table_param->end_write_records);
|
|
}
|
|
}
|
|
} else {
|
|
vector<QEP_TAB *> unhandled_duplicates;
|
|
iterator = ConnectJoins(const_tables, primary_tables, qep_tab, thd,
|
|
TOP_LEVEL, nullptr, nullptr, &unhandled_duplicates);
|
|
|
|
// If there were any weedouts that we had to drop during ConnectJoins()
|
|
// (ie., the join left some tables that were supposed to be deduplicated
|
|
// but were not), handle them now at the very end.
|
|
if (!unhandled_duplicates.empty()) {
|
|
iterator = CreateWeedoutIteratorForTables(
|
|
thd, unhandled_duplicates, qep_tab, primary_tables, move(iterator));
|
|
}
|
|
}
|
|
|
|
// Deal with any materialization happening at the end (typically for sorting,
|
|
// grouping or distinct).
|
|
for (MaterializeOperation materialize_op : final_materializations) {
|
|
QEP_TAB *qep_tab = materialize_op.temporary_qep_tab;
|
|
|
|
if (materialize_op.type ==
|
|
MaterializeOperation::AGGREGATE_THEN_MATERIALIZE) {
|
|
// Aggregate as we go, with output into a temporary table.
|
|
// (We can also aggregate as we go after the materialization step;
|
|
// see below. We won't be aggregating twice, though.)
|
|
if (qep_tab->tmp_table_param->precomputed_group_by) {
|
|
DBUG_ASSERT(rollup.state == ROLLUP::STATE_NONE);
|
|
iterator = NewIterator<PrecomputedAggregateIterator>(
|
|
thd, move(iterator), this, qep_tab->tmp_table_param,
|
|
qep_tab->ref_item_slice);
|
|
} else {
|
|
iterator = NewIterator<AggregateIterator>(
|
|
thd, move(iterator), this, qep_tab->tmp_table_param,
|
|
qep_tab->ref_item_slice, rollup.state != ROLLUP::STATE_NONE);
|
|
}
|
|
}
|
|
|
|
// Attach HAVING if needed (it's put on the QEP_TAB and not on the JOIN if
|
|
// we have a temporary table) and we've done all aggregation.
|
|
//
|
|
// FIXME: If the HAVING condition is an alias (a MySQL-specific extension),
|
|
// it could be evaluated twice; once for the condition, and again for the
|
|
// copying into the table. This was originally partially fixed by moving
|
|
// the HAVING into qep_tab->condition() instead, although this makes the
|
|
// temporary table larger than it needs to be, and is not a legal case in
|
|
// the presence of SELECT DISTINCT. (The main.having test has a few tests
|
|
// for this.) Later, it was completely fixed for the old executor,
|
|
// by evaluating the filter against the temporary table row (switching
|
|
// slices), although the conditional move into qep_tab->condition(),
|
|
// which was obsolete for the old executor after said fix, was never
|
|
// removed. See if we can get this fixed in the new executor as well,
|
|
// and then remove the code that moves HAVING onto qep_tab->condition().
    if (qep_tab->having != nullptr &&
        materialize_op.type != MaterializeOperation::AGGREGATE_INTO_TMP_TABLE) {
      iterator =
          NewIterator<FilterIterator>(thd, move(iterator), qep_tab->having);
    }

    // Sorting comes after the materialization (which we're about to add),
    // and should be shown as such. Prevent join_setup_iterator
    // from adding it to the result iterator; we'll add it ourselves below.
    //
    // Note that this would break the query if run by the old executor!
    Filesort *filesort = qep_tab->filesort;
    qep_tab->filesort = nullptr;

    qep_tab->iterator.reset();
    join_setup_iterator(qep_tab);

    qep_tab->table()->alias = "<temporary>";

    if (materialize_op.type == MaterializeOperation::WINDOWING_FUNCTION) {
      if (qep_tab->tmp_table_param->m_window->needs_buffering()) {
        iterator = NewIterator<BufferingWindowingIterator>(
            thd, move(iterator), qep_tab->tmp_table_param, this,
            qep_tab->ref_item_slice);
      } else {
        iterator = NewIterator<WindowingIterator>(
            thd, move(iterator), qep_tab->tmp_table_param, this,
            qep_tab->ref_item_slice);
      }
      if (!qep_tab->tmp_table_param->m_window_short_circuit) {
        iterator = NewIterator<MaterializeIterator>(
            thd, move(iterator), qep_tab->tmp_table_param, qep_tab->table(),
            move(qep_tab->iterator), /*cte=*/nullptr, select_lex->select_number,
            unit, this,
            /*ref_slice=*/-1, /*copy_fields_and_items_in_materialize=*/false,
            qep_tab->rematerialize, tmp_table_param.end_write_records);
      }
    } else if (materialize_op.type ==
               MaterializeOperation::AGGREGATE_INTO_TMP_TABLE) {
      iterator = NewIterator<TemptableAggregateIterator>(
          thd, move(iterator), qep_tab->tmp_table_param, qep_tab->table(),
          move(qep_tab->iterator), select_lex, this, qep_tab->ref_item_slice);
      if (qep_tab->having != nullptr) {
        iterator =
            NewIterator<FilterIterator>(thd, move(iterator), qep_tab->having);
      }
    } else {
      // MATERIALIZE or AGGREGATE_THEN_MATERIALIZE.
      bool copy_fields_and_items =
          (materialize_op.type !=
           MaterializeOperation::AGGREGATE_THEN_MATERIALIZE);

      // If we don't need the row IDs, and don't have some sort of
      // deduplication (e.g. for GROUP BY), filesort can take in the data
      // directly, without going through a temporary table.
      //
      // TODO: If the sort order is suitable (or extendable), we could take
      // over the deduplicating responsibilities of the temporary table and
      // activate this mode even if qep_tab->temporary_table_deduplicates()
      // is set.
      if (filesort != nullptr && filesort->using_addon_fields() &&
          !qep_tab->temporary_table_deduplicates()) {
        iterator = NewIterator<StreamingIterator>(
            thd, move(iterator), qep_tab->tmp_table_param, qep_tab->table(),
            copy_fields_and_items);
      } else {
        iterator = NewIterator<MaterializeIterator>(
            thd, move(iterator), qep_tab->tmp_table_param, qep_tab->table(),
            move(qep_tab->iterator), /*cte=*/nullptr, select_lex->select_number,
            unit, this, qep_tab->ref_item_slice, copy_fields_and_items,
            /*rematerialize=*/true,
            qep_tab->tmp_table_param->end_write_records);
      }

      // NOTE: There's no need to call join->add_materialize_iterator(),
      // as this iterator always rematerializes anyway.
    }

    if (qep_tab->condition() != nullptr) {
      iterator = NewIterator<FilterIterator>(thd, move(iterator),
                                             qep_tab->condition());
      qep_tab->mark_condition_as_pushed_to_sort();
    }

    // The pre-iterator executor does duplicate removal by going into the
    // temporary table and actually deleting records, using a hash table for
    // smaller tables and an O(n²) algorithm for large tables. This kind of
    // deletion is not cleanly representable in the iterator model, so we do
    // it using a duplicate-removing filesort instead, which has a straight-up
    // O(n log n) cost.
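    //
    // As a sketch of the difference (hypothetical query): for
    // SELECT DISTINCT b FROM t, the old executor would delete duplicate rows
    // from the temporary table after filling it, while here the
    // duplicate-removing filesort on (b) simply skips equal keys while
    // sorting.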
    if (qep_tab->needs_duplicate_removal) {
      bool all_order_fields_used;
      ORDER *order = create_order_from_distinct(
          thd, ref_items[qep_tab->ref_item_slice], this->order, fields_list,
          /*skip_aggregates=*/false, /*convert_bit_fields_to_long=*/false,
          &all_order_fields_used);
      if (order == nullptr) {
        // Only const fields.
        iterator = NewIterator<LimitOffsetIterator>(
            thd, move(iterator), /*select_limit_cnt=*/1,
            /*offset_limit_cnt=*/0,
            /*count_all_rows=*/false, /*skipped_rows=*/nullptr);
      } else {
        bool force_sort_positions = false;
        if (all_order_fields_used) {
          // The ordering for DISTINCT already gave us the right sort order,
          // so no need to sort again.
          filesort = nullptr;
        } else if (filesort != nullptr && !filesort->using_addon_fields()) {
          // We have the rather unusual situation here that we have two sorts
          // directly after each other, with no temporary table in-between,
          // and filesort expects to be able to refer to rows by their
          // position. Usually, the sort for DISTINCT would be a superset of
          // the sort for ORDER BY, but not always (e.g. when sorting by some
          // expression), so we could end up in a situation where the first
          // sort is by addon fields and the second one is by positions.
          //
          // Thus, in this case, we force the first sort to be by positions,
          // so that the result comes from SortFileIndirectIterator or
          // SortBufferIndirectIterator. These will both position the cursor
          // on the underlying temporary table correctly before returning it,
          // so that the successive filesort will save the right position
          // for the row.
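          //
          // A hypothetical query hitting this case:
          //   SELECT DISTINCT a + b FROM t ORDER BY a - b;
          // the deduplicating sort on (a + b) is not a superset of the sort
          // on (a - b), so both sorts must run, and the first must hand row
          // positions to the second.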
          force_sort_positions = true;
        }

        Filesort *dup_filesort = new (thd->mem_root) Filesort(
            thd, qep_tab, order, HA_POS_ERROR, /*force_stable_sort=*/false,
            /*remove_duplicates=*/true, force_sort_positions);
        iterator = NewIterator<SortingIterator>(thd, dup_filesort,
                                                move(iterator),
                                                &examined_rows);
        qep_tab->table()->duplicate_removal_iterator =
            down_cast<SortingIterator *>(iterator->real_iterator());
      }
    }

    if (filesort != nullptr) {
      iterator = NewIterator<SortingIterator>(thd, filesort, move(iterator),
                                              &examined_rows);
      qep_tab->table()->sorting_iterator =
          down_cast<SortingIterator *>(iterator->real_iterator());
    }
  }

  // See if we need to aggregate data in the final step. Note that we can
  // _not_ rely on streaming_aggregation, as it can be changed from false
  // to true during optimization, and depending on when it was set, it could
  // either mean to aggregate into a temporary table or aggregate on final
  // send.
  bool do_aggregate;
  if (primary_tables == 0 && tmp_tables == 0) {
    // We can't check qep_tab since there's no table, but in this specific
    // case, it is safe to call get_end_select_func() at this point.
    do_aggregate = (get_end_select_func() == end_send_group);
  } else {
    // Note that tmp_table_param.precomputed_group_by can be set even if we
    // don't actually have any grouping (e.g., make_tmp_tables_info() does
    // this even if there are no temporary tables made).
    do_aggregate = (qep_tab[primary_tables + tmp_tables - 1].next_select ==
                    end_send_group) ||
                   ((grouped || group_optimized_away) &&
                    tmp_table_param.precomputed_group_by);
  }
  if (do_aggregate) {
    // Aggregate as we go, with output into a special slice of the same table.
    DBUG_ASSERT(streaming_aggregation || tmp_table_param.precomputed_group_by);
#ifndef DBUG_OFF
    for (MaterializeOperation materialize_op : final_materializations) {
      DBUG_ASSERT(materialize_op.type !=
                  MaterializeOperation::AGGREGATE_THEN_MATERIALIZE);
    }
#endif
    if (tmp_table_param.precomputed_group_by) {
      iterator = NewIterator<PrecomputedAggregateIterator>(
          thd, move(iterator), this, &tmp_table_param,
          REF_SLICE_ORDERED_GROUP_BY);
      DBUG_ASSERT(rollup.state == ROLLUP::STATE_NONE);
    } else {
      iterator = NewIterator<AggregateIterator>(
          thd, move(iterator), this, &tmp_table_param,
          REF_SLICE_ORDERED_GROUP_BY, rollup.state != ROLLUP::STATE_NONE);
    }
  }

  return iterator;
}

unique_ptr_destroy_only<RowIterator>
JOIN::attach_iterators_for_having_and_limit(
    unique_ptr_destroy_only<RowIterator> iterator) {
  // Attach HAVING and LIMIT if needed.
  // NOTE: We can have HAVING even without GROUP BY, although it's not very
  // useful.
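  //
  // A hypothetical example: SELECT 1 AS x FROM t HAVING x > 0 has a HAVING
  // clause but no GROUP BY; it is still handled here, by the same
  // FilterIterator mechanism as any other HAVING.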
  if (having_cond != nullptr) {
    iterator = NewIterator<FilterIterator>(thd, move(iterator), having_cond);
  }

  // Note: For select_count, LIMIT 0 is handled in JOIN::optimize() for the
  // common case, but not for CALC_FOUND_ROWS. OFFSET also isn't handled
  // there.
  if (unit->select_limit_cnt != HA_POS_ERROR || unit->offset_limit_cnt != 0) {
    iterator = NewIterator<LimitOffsetIterator>(
        thd, move(iterator), unit->select_limit_cnt, unit->offset_limit_cnt,
        calc_found_rows, &send_records);
  }

  return iterator;
}

// Used only in the specific, odd case of a UNION between a non-iterator
// and an iterator query block.
static int ExecuteIteratorQuery(JOIN *join) {
  // The outermost LimitOffsetIterator, if any, will increment send_records for
  // each record skipped by OFFSET. This is needed because LIMIT 50 OFFSET 10
  // with no SQL_CALC_FOUND_ROWS is defined to return 60, not 50 (even though
  // it's not necessarily the most useful definition).
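  //
  // Example of these semantics: if a table has 100 qualifying rows,
  //   SELECT * FROM t LIMIT 50 OFFSET 10
  // sends 50 rows to the client but leaves send_records at 60
  // (50 sent plus 10 skipped by OFFSET).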
  join->send_records = 0;

  join->thd->get_stmt_da()->reset_current_row_for_condition();
  if (join->root_iterator()->Init()) {
    return 1;
  }

  PFSBatchMode pfs_batch_mode(join->root_iterator());
  for (;;) {
    int error = join->root_iterator()->Read();

    DBUG_EXECUTE_IF("bug13822652_1", join->thd->killed = THD::KILL_QUERY;);

    if (error > 0 || (join->thd->is_error()))  // Fatal error
      return 1;
    else if (error < 0)
      break;
    else if (join->thd->killed)  // Aborted by user
    {
      join->thd->send_kill_message();
      return -1;
    }

    ++join->send_records;

    if (join->select_lex->query_result()->send_data(join->thd,
                                                    *join->fields)) {
      return 1;
    }
    join->thd->get_stmt_da()->inc_current_row_for_condition();
  }
  return 0;
}

/**
  Make a join of all tables and write it on socket or to table.

  @retval
    0 if ok
  @retval
    1 if error is sent
  @retval
    -1 if error should be sent
*/

static int do_select(JOIN *join) {
  int rc = 0;
  enum_nested_loop_state error = NESTED_LOOP_OK;
  DBUG_TRACE;

  join->send_records = 0;
  THD *thd = join->thd;

  if (join->root_iterator() != nullptr) {
    error =
        ExecuteIteratorQuery(join) == 0 ? NESTED_LOOP_OK : NESTED_LOOP_ERROR;
  } else if (join->select_count) {
    QEP_TAB *qep_tab = join->qep_tab;
    error = end_send_count(join, qep_tab);
  } else if (join->plan_is_const() && !join->need_tmp_before_win) {
    // Special code for dealing with queries that don't need to
    // read any tables.

    Next_select_func end_select = join->get_end_select_func();
    /*
      HAVING will be checked after processing aggregate functions,
      but WHERE should be checked here (we have already read the tables).

      @todo: consider calling end_select instead of duplicating code
    */
    if (!join->where_cond || join->where_cond->val_int()) {
      // HAVING will be checked by end_select
      error = (*end_select)(join, 0, 0);
      if (error >= NESTED_LOOP_OK) error = (*end_select)(join, 0, 1);

      // This is a special case because const-only plans don't go through
      // iterators, which would normally be responsible for incrementing
      // examined_rows.
      join->examined_rows++;
      DBUG_ASSERT(join->examined_rows <= 1);
    } else if (join->send_row_on_empty_set()) {
      table_map save_nullinfo = 0;

      // Calculate aggregate functions for no rows
      for (Item &item : *join->fields) {
        item.no_rows_in_result();
      }

      /*
        Mark tables as containing only NULL values for processing
        the HAVING clause and for send_data().
        Calculate a set of tables for which NULL values need to be restored
        after sending data.
      */
      if (join->clear_fields(&save_nullinfo))
        error = NESTED_LOOP_ERROR;
      else {
        if (having_is_true(join->having_cond) &&
            join->should_send_current_row())
          rc = join->select_lex->query_result()->send_data(thd, *join->fields);

        // Restore NULL values if needed.
        if (save_nullinfo) join->restore_fields(save_nullinfo);
      }
    }
    /*
      An error can happen when evaluating the conds
      (the join condition and the piece of the where clause
      relevant to this join table).
    */
    if (thd->is_error()) error = NESTED_LOOP_ERROR;
  } else {
    // Pre-iterator query execution path.
    DBUG_ASSERT(join->primary_tables);

    QEP_TAB *qep_tab = join->qep_tab + join->const_tables;
    error = join->first_select(join, qep_tab, 0);
    if (error >= NESTED_LOOP_OK) error = join->first_select(join, qep_tab, 1);
  }

  thd->current_found_rows = join->send_records;
  /*
    For "order by with limit", we cannot rely on send_records, but need
    to use the rowcount read originally into the join_tab applying the
    filesort. There cannot be any post-filtering conditions, nor any
    following join_tabs in this case, so this rowcount properly represents
    the correct number of qualifying rows.
  */
  if (join->qep_tab && join->order) {
    // Save # of found records prior to cleanup
    QEP_TAB *sort_tab;
    uint const_tables = join->const_tables;

    // Take record count from first non-constant table or from last tmp table
    if (join->tmp_tables > 0)
      sort_tab = &join->qep_tab[join->primary_tables + join->tmp_tables - 1];
    else {
      DBUG_ASSERT(!join->plan_is_const());
      sort_tab = &join->qep_tab[const_tables];
    }
    if (sort_tab->filesort && join->calc_found_rows &&
        sort_tab->filesort->sortorder &&
        sort_tab->filesort->limit != HA_POS_ERROR) {
      thd->current_found_rows = sort_tab->records();
    }
  }

  if (error != NESTED_LOOP_OK) rc = -1;

  if (!join->select_lex->is_recursive() ||
      join->select_lex->master_unit()->got_all_recursive_rows) {
    /*
      The following will unlock all cursors if the command wasn't an
      update command
    */
    join->join_free();  // Unlock all cursors
    if (error == NESTED_LOOP_OK) {
      /*
        Sic: this branch works even if rc != 0, e.g. when
        send_data above returns an error.
      */
      if (join->select_lex->query_result()->send_eof(thd))
        rc = 1;  // Don't send error
      DBUG_PRINT("info", ("%ld records output", (long)join->send_records));
    }
  }

  rc = thd->is_error() ? -1 : rc;
#ifndef DBUG_OFF
  if (rc) {
    DBUG_PRINT("error", ("Error: do_select() failed"));
  }
#endif
  return rc;
}

/**
  @brief Accumulate full or partial join result in operation and send
  operation's result further.

  @param join           pointer to the structure providing all context info
                        for the query
  @param qep_tab        the QEP_TAB object to which the operation is attached
  @param end_of_records true <=> all records were accumulated, send them
                        further

  @details
  This function accumulates records, one by one, in the QEP operation's
  buffer by calling op->put_record(). When there are no more records to save
  (i.e. the end_of_records argument is true), the function tells the QEP
  operation to send records further by calling op->send_records().
  When all records are sent this function passes the 'end_of_records' signal
  further by calling sub_select() with the end_of_records argument set to
  true. After that, op->end_send() is called to tell the QEP operation that
  it can end its internal buffer scan.

  @note
  This function is not expected to be called when dynamic range scan is
  used to scan join_tab, because the join cache is disabled for such a scan
  and range scans aren't used for tmp tables.
  @see setup_join_buffering
  For caches the function implements the algorithmic schema for both
  Blocked Nested Loop Join and Batched Key Access Join. The difference can
  be seen only at the level of the implementation of the put_record and
  send_records virtual methods for the cache object associated with the
  join_tab.

  @return
    return one of enum_nested_loop_state.
*/

enum_nested_loop_state sub_select_op(JOIN *join, QEP_TAB *qep_tab,
                                     bool end_of_records) {
  DBUG_TRACE;

  if (join->thd->killed) {
    /* The user has aborted the execution of the query */
    join->thd->send_kill_message();
    return NESTED_LOOP_KILLED;
  }

  enum_nested_loop_state rc;
  QEP_operation *op = qep_tab->op;

  /* This function cannot be called if qep_tab has no associated operation */
  DBUG_ASSERT(op != NULL);
  if (end_of_records) {
    rc = op->end_send();
    if (rc >= NESTED_LOOP_OK) rc = sub_select(join, qep_tab, end_of_records);
    return rc;
  }
  if (qep_tab->prepare_scan()) return NESTED_LOOP_ERROR;

  /*
    setup_join_buffering() disables join buffering if QS_DYNAMIC_RANGE is
    enabled.
  */
  DBUG_ASSERT(!qep_tab->dynamic_range());

  rc = op->put_record();

  return rc;
}

/**
  Retrieve records that extend a given partial join record, from the result
  of a join.

  For a given partial join record consisting of records from the tables
  preceding the table join_tab in the execution plan, the function
  retrieves all matching full records from the result set and
  sends them to the result set stream.

  @note
  The function effectively implements the final (n-k) nested loops
  of the nested loops join algorithm, where k is the ordinal number of
  the join_tab table and n is the total number of tables in the join query.
  It performs nested loops joins with all conjunctive predicates from
  the where condition pushed as low to the tables as possible.
  E.g. for the query
  @code
    SELECT * FROM t1,t2,t3
    WHERE t1.a=t2.a AND t2.b=t3.b AND t1.a BETWEEN 5 AND 9
  @endcode
  the predicate (t1.a BETWEEN 5 AND 9) will be pushed to table t1,
  given the selected plan prescribes to nest retrievals of the
  joined tables in the following order: t1,t2,t3.
  A pushed down predicate is attached to the table which it is pushed to,
  at the field join_tab->cond.
  When executing a nested loop of level k the function runs through
  the rows of 'join_tab' and for each row checks the pushed condition
  attached to the table.
  If it is false the function moves to the next row of the
  table. If the condition is true the function recursively executes (n-k-1)
  remaining embedded nested loops.
  The situation becomes more complicated if outer joins are involved in
  the execution plan. In this case the pushed down predicates can be
  checked only under certain conditions.
  Suppose for the query
  @code
    SELECT * FROM t1 LEFT JOIN (t2,t3) ON t3.a=t1.a
    WHERE t1>2 AND (t2.b>5 OR t2.b IS NULL)
  @endcode
  the optimizer has chosen a plan with the table order t1,t2,t3.
  The predicate P1=t1>2 will be pushed down to the table t1, while the
  predicate P2=(t2.b>5 OR t2.b IS NULL) will be attached to the table
  t2. But the second predicate cannot be unconditionally tested right
  after a row from t2 has been read. This can be done only after the
  first row with t3.a=t1.a has been encountered.
  Thus, the second predicate P2 is supplied with a guard value that is
  stored in the field 'found' of the first inner table for the outer join
  (table t2). When the first row with t3.a=t1.a for the current row
  of table t1 appears, the value becomes true. From then on the predicate
  is evaluated immediately after the row of table t2 has been read.
  When the first row with t3.a=t1.a has been encountered all
  conditions attached to the inner tables t2,t3 must be evaluated.
  Only when all of them are true is the row sent to the output stream.
  If not, the function returns to the lowest nest level that has a false
  attached condition.
  The predicates from on expressions are also pushed down. If in the
  above example the on expression were (t3.a=t1.a AND t2.a=t1.a),
  then t1.a=t2.a would be pushed down to table t2, and without any
  guard.
  If after the run through all rows of table t2, the first inner table
  for the outer join operation, it turns out that no matches are
  found for the current row of t1, then the current row from table t1
  is complemented by nulls for t2 and t3. Then the pushed down predicates
  are checked for the composed row almost in the same way as it had
  been done for the first row with a match. The only difference is that
  the predicates from on expressions are not checked.

  @par
  @b IMPLEMENTATION
  @par
  The function forms output rows for a current partial join of k
  tables recursively.
  For each partial join record ending with a certain row from
  join_tab it calls sub_select that builds all possible matching
  tails from the result set.
  To be able to check predicates conditionally, items of the class
  Item_func_trig_cond are employed.
  An object of this class is constructed from an item of class COND
  and a pointer to a guarding boolean variable.
  When the value of the guard variable is true the value of the object
  is the same as the value of the predicate; otherwise it just returns
  true.
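
  Conceptually (a sketch of the guard mechanism, not the actual class
  interface):

  @code
    trig_cond(guard, P) := guard ? P : true
  @endcode

  so the guarded predicate P2 above only starts filtering once the guard
  variable (the 'found' flag of the first inner table) has been set to true.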

  Testing predicates at the optimal time can be tricky, especially for
  outer joins. Consider the following query:

  @code
    SELECT * FROM t1
      LEFT JOIN
        (t2 JOIN t3 ON t2.a=t3.a)
      ON t1.a=t2.a
    WHERE t2.b=5 OR t2.b IS NULL
  @endcode

  (The OR ... IS NULL is solely so that the outer join cannot be rewritten
  to an inner join.)

  Suppose the chosen execution plan dictates the order t1,t2,t3,
  and suppose that we have found a row t1 and are scanning t2.
  We cannot filter rows from t2 as we see them, as the LEFT JOIN needs
  to know that there existed at least one (t2,t3) tuple matching t1,
  so that it should not synthesize a NULL-complemented row.

  However, once we have a matching t3, we can activate the predicate
  (t2.b=5 OR t2.b IS NULL). (Note that it does not refer to t3 at all.)
  If it fails, we should immediately stop scanning t3 and go back to
  scanning t2 (or in general, arbitrarily early), which is done by setting
  the field 'return_tab' of the JOIN.

  Now consider a similar but more complex case:

  @code
    SELECT * FROM t1
      LEFT JOIN
        (t2, t3 LEFT JOIN (t4,t5) ON t5.a=t3.a)
      ON t4.a=t2.a
    WHERE (t2.b=5 OR t2.b IS NULL) AND (t4.b=2 OR t4.b IS NULL)
  @endcode

  In order not to re-evaluate the predicates that were already evaluated
  as attached pushed down predicates, a pointer to the first
  most inner unmatched table is maintained in join_tab->first_unmatched.
  Thus, when the first row from t5 with t5.a=t3.a is found
  this pointer for t5 is changed from t4 to t2.

  @par
  @b STRUCTURE @b NOTES
  @par
  join_tab->first_unmatched always points backwards to the first inner
  table of the embedding nested join, if any.

  @param join           pointer to the structure providing all context info
                        for the query
  @param qep_tab        the first next table of the execution plan to be
                        retrieved
  @param end_of_records true when we need to perform the final steps of
                        retrieval

  @return
    return one of enum_nested_loop_state, except NESTED_LOOP_NO_MORE_ROWS.
*/

enum_nested_loop_state sub_select(JOIN *join, QEP_TAB *const qep_tab,
                                  bool end_of_records) {
  DBUG_TRACE;

  TABLE *const table = qep_tab->table();

  /*
    Enable the items which one should use if one wants to evaluate anything
    (e.g. functions in WHERE, HAVING) involving columns of this table.
  */
  Switch_ref_item_slice slice_switch(join, qep_tab->ref_item_slice);

  if (end_of_records) {
    enum_nested_loop_state nls =
        (*qep_tab->next_select)(join, qep_tab + 1, end_of_records);

    return nls;
  }

  if (qep_tab->prepare_scan()) return NESTED_LOOP_ERROR;

  if (qep_tab->starts_weedout()) {
    do_sj_reset(qep_tab->flush_weedout_table);
  }

  const plan_idx qep_tab_idx = qep_tab->idx();
  join->return_tab = qep_tab_idx;
  qep_tab->not_null_compl = true;
  qep_tab->found_match = false;

  if (qep_tab->last_inner() != NO_PLAN_IDX) {
    /* qep_tab is the first inner table for an outer join operation. */

    /* Set initial state of guard variables for this table. */
    qep_tab->found = false;

    /* Set first_unmatched for the last inner table of this group */
    QEP_AT(qep_tab, last_inner()).first_unmatched = qep_tab_idx;
  }
  if (qep_tab->do_firstmatch() || qep_tab->do_loosescan()) {
    /*
      qep_tab is the first table of a LooseScan range, or has a "jump"
      address in a FirstMatch range.
      Reset the matching for this round of execution.
    */
    QEP_AT(qep_tab, match_tab).found_match = false;
  }

  join->thd->get_stmt_da()->reset_current_row_for_condition();

  enum_nested_loop_state rc = NESTED_LOOP_OK;

  bool in_first_read = true;
  const bool is_recursive_ref = qep_tab->table_ref->is_recursive_reference();
  // Init these 3 variables even though they are used only when
  // is_recursive_ref is true.
  const ha_rows *recursive_row_count = nullptr;
  ha_rows recursive_row_count_start = 0;
  bool count_iterations = false;

  if (is_recursive_ref) {
    // See also Recursive_executor's documentation
    if (join->unit->got_all_recursive_rows) return rc;
    // The recursive CTE algorithm requires a table scan.
    DBUG_ASSERT(qep_tab->type() == JT_ALL);
    in_first_read = !table->file->inited;
    /*
      The tmp table which we're reading is bound to this result, and we'll
      be checking its row count frequently:
    */
    recursive_row_count =
        join->unit->recursive_result(join->select_lex)->row_count();
    // How many rows we have already read; defines start of iteration.
    recursive_row_count_start = qep_tab->m_fetched_rows;
    // Execution of fake_select_lex doesn't count for the user:
    count_iterations = join->select_lex != join->unit->fake_select_lex;
  }

  // NOTE: If we are reading from a SortingIterator, it will set up batch mode
  // by itself, so don't activate it here. (It won't be activated when reading
  // the records back, though, only during the sort itself.)
  const bool pfs_batch_update =
      qep_tab->filesort == nullptr && qep_tab->pfs_batch_update(join);
  if (pfs_batch_update) table->file->start_psi_batch_mode();

  RowIterator *iterator = qep_tab->iterator.get();
  while (rc == NESTED_LOOP_OK && join->return_tab >= qep_tab_idx) {
    int error;

    if (is_recursive_ref &&
        qep_tab->m_fetched_rows >= *recursive_row_count) {
      // We have read all that's in the tmp table: signal EOF.
      error = -1;
      break;
    }

    if (in_first_read) {
      in_first_read = false;
      if (iterator->Init()) {
        rc = NESTED_LOOP_ERROR;
        break;
      }
    }
    error = iterator->Read();

    DBUG_EXECUTE_IF("bug13822652_1", join->thd->killed = THD::KILL_QUERY;);

    if (error > 0 || (join->thd->is_error()))  // Fatal error
      rc = NESTED_LOOP_ERROR;
    else if (error < 0)
      break;
    else if (join->thd->killed)  // Aborted by user
    {
      join->thd->send_kill_message();
      rc = NESTED_LOOP_KILLED;
    } else {
      qep_tab->m_fetched_rows++;
      if (is_recursive_ref &&
          qep_tab->m_fetched_rows == recursive_row_count_start + 1) {
        /*
          We have just read one row further than the set of rows of the
          iteration, so we have actually just entered a new iteration.
        */
        if (count_iterations &&
            ++join->recursive_iteration_count >
                join->thd->variables.cte_max_recursion_depth) {
          my_error(ER_CTE_MAX_RECURSION_DEPTH, MYF(0),
                   join->recursive_iteration_count);
          rc = NESTED_LOOP_ERROR;
          break;
        }
        // This new iteration sees the rows made by the previous one:
        recursive_row_count_start = *recursive_row_count;
      }
      if (qep_tab->rowid_status == NEED_TO_CALL_POSITION_FOR_ROWID) {
        table->file->position(table->record[0]);
      }
      rc = evaluate_join_record(join, qep_tab);
    }
  }

  if (rc == NESTED_LOOP_OK && qep_tab->last_inner() != NO_PLAN_IDX &&
      !qep_tab->found)
    rc = evaluate_null_complemented_join_record(join, qep_tab);

  if (pfs_batch_update) table->file->end_psi_batch_mode();

  return rc;
}

void QEP_TAB::refresh_lateral() {
  /*
    See if some lateral derived table further down on the execution path
    depends on us. If so, mark it for rematerialization.
    Note that if this lateral DT depends only on const tables, the function
    does nothing, as it's not called for const tables; however, the lateral
    DT is materialized once in its prepare_scan(), like for a non-lateral DT.
    todo: could this dependency-map idea be reused to decrease the amount of
    execution for JSON_TABLE too? For now, JSON_TABLE is rematerialized every
    time we're about to read it.
  */
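  /*
    For example (a hypothetical query):

      SELECT * FROM t1,
        LATERAL (SELECT SUM(t2.b) AS s FROM t2 WHERE t2.a = t1.a) AS dt;

    every new row produced for t1 invalidates dt's materialized content, so
    t1's QEP_TAB has dt's bit set in lateral_derived_tables_depend_on_me and
    this function sets dt's 'rematerialize' flag.
  */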
  JOIN *j = join();
  DBUG_ASSERT(j->has_lateral && lateral_derived_tables_depend_on_me);
  auto deps = lateral_derived_tables_depend_on_me;
  for (QEP_TAB **tab2 = j->map2qep_tab; deps; tab2++, deps >>= 1) {
    if (deps & 1) (*tab2)->rematerialize = true;
  }
}

/**
  @brief Prepare table to be scanned.

  @details This function is the place to do any work on the table that
           needs to be done before the table can be scanned. Currently it
           materializes derived tables and semi-joined subqueries,
           binds the buffer for the current rowid and removes duplicates
           if needed.

  @returns false - Ok, true - error
*/

bool QEP_TAB::prepare_scan() {
  // Check whether materialization is required.
  if (!materialize_table) return false;

  if (table()->materialized) {
    if (!rematerialize) return false;
    if (table()->empty_result_table()) return true;
  }

  // Materialize the table prior to reading it
  if ((*materialize_table)(this)) return true;

  if (table_ref && table_ref->is_derived() &&
      table_ref->derived_unit()->m_lateral_deps)
    // no further materialization, unless dependencies change
    rematerialize = false;

  // Bind to the rowid buffer managed by the TABLE object.
  if (copy_current_rowid) copy_current_rowid->bind_buffer(table()->file->ref);

  table()->set_not_started();

  if (needs_duplicate_removal && remove_duplicates()) return true;

  return false;
}

/**
  SemiJoinDuplicateElimination: Weed out duplicate row combinations

  SYNOPSIS
    do_sj_dups_weedout()
      thd    Thread handle
      sjtbl  Duplicate weedout table

  DESCRIPTION
    Try storing the current record combination of the outer tables (i.e.
    their rowids) in the temporary table. This records the fact that we've
    seen this record combination, and also tells us if we've seen it before.

  RETURN
    -1  Error
    1   The row combination is a duplicate (discard it)
    0   The row combination is not a duplicate (continue)
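
  EXAMPLE
    A sketch of the stored tuple (actual sizes depend on the storage
    engines involved): with two 4-byte rowids and one null byte, the single
    varchar column written to the weedout table is laid out as

      [length][null bits][rowid of outer tab #1][rowid of outer tab #2]

    so a duplicate-key hit on insertion means this exact row combination
    has been seen before.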
*/

int do_sj_dups_weedout(THD *thd, SJ_TMP_TABLE *sjtbl) {
  int error;
  SJ_TMP_TABLE::TAB *tab = sjtbl->tabs;
  SJ_TMP_TABLE::TAB *tab_end = sjtbl->tabs_end;

  DBUG_TRACE;

  if (sjtbl->is_confluent) {
    if (sjtbl->have_confluent_row)
      return 1;
    else {
      sjtbl->have_confluent_row = true;
      return 0;
    }
  }

  uchar *ptr = sjtbl->tmp_table->visible_field_ptr()[0]->ptr;
  // Put the rowids tuple into table->record[0]:
  // 1. Store the length
  if (((Field_varstring *)(sjtbl->tmp_table->visible_field_ptr()[0]))
          ->length_bytes == 1) {
    *ptr = (uchar)(sjtbl->rowid_len + sjtbl->null_bytes);
    ptr++;
  } else {
    int2store(ptr, sjtbl->rowid_len + sjtbl->null_bytes);
    ptr += 2;
  }

  // 2. Zero the null bytes
  uchar *const nulls_ptr = ptr;
  if (sjtbl->null_bytes) {
    memset(ptr, 0, sjtbl->null_bytes);
    ptr += sjtbl->null_bytes;
  }

  // 3. Put the rowids
  for (uint i = 0; tab != tab_end; tab++, i++) {
    handler *h = tab->qep_tab->table()->file;
    if (tab->qep_tab->table()->is_nullable() &&
        tab->qep_tab->table()->has_null_row()) {
      /* It's a NULL-complemented row */
      *(nulls_ptr + tab->null_byte) |= tab->null_bit;
      memset(ptr + tab->rowid_offset, 0, h->ref_length);
    } else {
      /* Copy the rowid value */
      memcpy(ptr + tab->rowid_offset, h->ref, h->ref_length);
    }
  }

  if (!check_unique_constraint(sjtbl->tmp_table)) return 1;
  error = sjtbl->tmp_table->file->ha_write_row(sjtbl->tmp_table->record[0]);
  if (error) {
    /* If this is a duplicate error, return immediately */
    if (sjtbl->tmp_table->file->is_ignorable_error(error)) return 1;
    /*
      An error other than a duplicate error: attempt to convert the
      in-memory temporary table to an on-disk one.
    */
    bool is_duplicate;
    if (create_ondisk_from_heap(thd, sjtbl->tmp_table, error, true,
                                &is_duplicate))
      return -1;
    return is_duplicate ? 1 : 0;
  }
  return 0;
}

/**
  SemiJoinDuplicateElimination: Reset the temporary table
*/

static int do_sj_reset(SJ_TMP_TABLE *sj_tbl) {
  DBUG_TRACE;
  if (sj_tbl->tmp_table) {
    int rc = sj_tbl->tmp_table->empty_result_table();
    if (sj_tbl->tmp_table->hash_field)
      sj_tbl->tmp_table->file->ha_index_init(0, false);
    return rc;
  }
  sj_tbl->have_confluent_row = false;
  return 0;
}

/**
  @brief Process one row of the nested loop join.

  This function will evaluate parts of the WHERE/ON clauses that are
  applicable to the partial row on hand, and in case of success
  submit this row to the next level of the nested loop.
  join_tab->return_tab may be modified to cause a return to a previous
  join_tab.

  @param join     The join object
  @param qep_tab  The most inner qep_tab being processed

  @return Nested loop state
*/

static enum_nested_loop_state evaluate_join_record(JOIN *join,
                                                   QEP_TAB *const qep_tab) {
  bool not_used_in_distinct = qep_tab->not_used_in_distinct;
  ha_rows found_records = join->found_records;
  Item *condition = qep_tab->condition();
  const plan_idx qep_tab_idx = qep_tab->idx();
  bool found = true;
  DBUG_TRACE;
  DBUG_PRINT("enter", ("join: %p join_tab index: %d table: %s cond: %p", join,
                       static_cast<int>(qep_tab_idx), qep_tab->table()->alias,
                       condition));

  if (condition) {
    found = condition->val_int();

    if (join->thd->killed) {
      join->thd->send_kill_message();
      return NESTED_LOOP_KILLED;
    }

    /* check for errors evaluating the condition */
    if (join->thd->is_error()) return NESTED_LOOP_ERROR;
  }
  if (found) {
    /*
      There is no condition on this join_tab or the attached pushed down
      condition is true => a match is found.
    */
    while (qep_tab->first_unmatched != NO_PLAN_IDX && found) {
      /*
        The while condition is always false if join_tab is not
        the last inner join table of an outer join operation.
      */
      QEP_TAB *first_unmatched = &QEP_AT(qep_tab, first_unmatched);
      /*
        Mark that a match for the current row of the outer table is found.
        This activates WHERE clause predicates attached to the inner tables
        of the outer join.
      */
      first_unmatched->found = true;
      for (QEP_TAB *tab = first_unmatched; tab <= qep_tab; tab++) {
        /*
          Check all predicates that have just been activated.

          Actually all predicates not guarded by first_unmatched->found
          will be re-evaluated again. It could be fixed, but it's probably
          not worth doing now.

          not_exists_optimize has been created from a
          condition containing 'is_null'. This 'is_null'
          predicate is still present on any 'tab' with
          'not_exists_optimize'. Furthermore, the usual rules
          for condition guards also apply for
          'not_exists_optimize' -> when 'is_null==false' we
          know all cond. guards are open and we can apply
          the 'not_exists_optimize'.
        */
        DBUG_ASSERT(
            !(tab->table()->reginfo.not_exists_optimize && !tab->condition()));

        if (tab->condition() && !tab->condition()->val_int()) {
          /* The condition attached to table tab is false */

          if (tab->table()->reginfo.not_exists_optimize) {
            /*
              When not_exists_optimize is set and a matching row is found,
              the outer row should be excluded from the result set: no need
              to explore this record, thus we don't call the next_select.
              And, no need to explore other following records of 'tab', so
              we set join->return_tab.
              As we set join_tab->found above, evaluate_join_record() at the
              upper level will not yield a NULL-complemented record.
              Note that the calculation below can set return_tab to -1,
              i.e. PRE_FIRST_PLAN_IDX.
            */
            join->return_tab = qep_tab_idx - 1;
            return NESTED_LOOP_OK;
          }

          if (tab == qep_tab)
            found = 0;
          else {
            /*
              Set a return point if the rejected predicate is attached
              not to the last table of the current nest level.
            */
            join->return_tab = tab->idx();
            return NESTED_LOOP_OK;
          }
        }
        /* check for errors evaluating the condition */
        if (join->thd->is_error()) return NESTED_LOOP_ERROR;
      }
      /*
        Check whether join_tab is not the last inner table
        for another embedding outer join.
      */
      plan_idx f_u = first_unmatched->first_upper();
      if (f_u != NO_PLAN_IDX && join->qep_tab[f_u].last_inner() != qep_tab_idx)
        f_u = NO_PLAN_IDX;
      qep_tab->first_unmatched = f_u;
    }

    plan_idx return_tab = join->return_tab;

    if (qep_tab->finishes_weedout() && found) {
      int res = do_sj_dups_weedout(join->thd, qep_tab->check_weed_out_table);
      if (res == -1)
        return NESTED_LOOP_ERROR;
      else if (res == 1)
        found = false;
    } else if (qep_tab->do_loosescan() &&
               QEP_AT(qep_tab, match_tab).found_match) {
      /*
        The LooseScan algorithm requires an access method that gives 'sorted'
        retrieval of keys, or an access method that provides only one
        row (which is inherently sorted).
        EQ_REF and LooseScan may happen if dependencies in the subquery
        (e.g., an outer join) prevent table pull-out.
      */
      DBUG_ASSERT(qep_tab->use_order() || qep_tab->type() == JT_EQ_REF);

      /*
        The previous row combination for the duplicate-generating range
        generated a match. Compare the keys of this row and the previous row
        to determine if this is a duplicate that should be skipped.
      */
      if (key_cmp(qep_tab->table()->key_info[qep_tab->index()].key_part,
                  qep_tab->loosescan_buf, qep_tab->loosescan_key_len))
        /*
          Keys do not match.
          Reset found_match for the last table of the duplicate-generating
          range, to avoid comparing keys until a new match has been found.
        */
        QEP_AT(qep_tab, match_tab).found_match = false;
      else
        found = false;
    }

    /*
      It was not just a return to a lower loop level when one
      of the newly activated predicates is evaluated as false
      (See above join->return_tab= tab).
    */

    if (found) {
      enum enum_nested_loop_state rc;
      // A match is found for the current partial join prefix.
      qep_tab->found_match = true;
      if (unlikely(qep_tab->lateral_derived_tables_depend_on_me))
        qep_tab->refresh_lateral();

      rc = (*qep_tab->next_select)(join, qep_tab + 1, 0);

      if (rc != NESTED_LOOP_OK) return rc;

      /* check for errors evaluating the condition */
      if (join->thd->is_error()) return NESTED_LOOP_ERROR;

      if (qep_tab->do_loosescan() && QEP_AT(qep_tab, match_tab).found_match) {
        /*
          A match was found for a duplicate-generating range of a semijoin.
          Copy the key to be able to determine whether subsequent rows
          will give duplicates that should be skipped.
        */
        KEY *key = qep_tab->table()->key_info + qep_tab->index();
        key_copy(qep_tab->loosescan_buf, qep_tab->table()->record[0], key,
                 qep_tab->loosescan_key_len);
      } else if (qep_tab->do_firstmatch() &&
                 QEP_AT(qep_tab, match_tab).found_match) {
        /*
          We should return to join_tab->firstmatch_return after we have
          enumerated all the suffixes for the current prefix row combination.
        */
        set_if_smaller(return_tab, qep_tab->firstmatch_return);
      }

      /*
        Test if this was a SELECT DISTINCT query on a table that
        was not in the field list; in this case we can abort if
        we found a row, as no new rows can be added to the result.
      */
      if (not_used_in_distinct && found_records != join->found_records)
        set_if_smaller(return_tab, qep_tab_idx - 1);

      set_if_smaller(join->return_tab, return_tab);
    } else {
      if (qep_tab->not_null_compl) {
        /* a NULL-complemented row is not in a table so cannot be locked */
        qep_tab->iterator->UnlockRow();
      }
    }
  } else {
    /*
      The condition pushed down to the table join_tab rejects all rows
      with the beginning coinciding with the current partial join.
    */
    if (qep_tab->not_null_compl) qep_tab->iterator->UnlockRow();
  }
  return NESTED_LOOP_OK;
}

/**

  @details
  Construct a NULL-complemented partial join record and feed it to the next
  level of the nested loop. This function is used in case we have
  an OUTER join and no matching record was found.
*/

static enum_nested_loop_state evaluate_null_complemented_join_record(
    JOIN *join, QEP_TAB *qep_tab) {
  /*
    The table join_tab is the first inner table of an outer join operation
    and no matches have been found for the current outer row.
  */
  QEP_TAB *first_inner_tab = qep_tab;
  QEP_TAB *last_inner_tab = &QEP_AT(qep_tab, last_inner());

  DBUG_TRACE;

  bool matching = true;
  enum_nested_loop_state rc = NESTED_LOOP_OK;

  for (; qep_tab <= last_inner_tab; qep_tab++) {
    // Make sure that the rowid buffer is bound, duplicates weedout needs it
    if (qep_tab->copy_current_rowid &&
        !qep_tab->copy_current_rowid->buffer_is_bound())
      qep_tab->copy_current_rowid->bind_buffer(qep_tab->table()->file->ref);

    /* Change the values of the guard predicate variables. */
    qep_tab->found = true;
    qep_tab->not_null_compl = false;
    // The outer row is complemented by null values for each field from the
    // inner tables.
    qep_tab->table()->set_null_row();
    if (qep_tab->starts_weedout() && qep_tab > first_inner_tab) {
      // sub_select() has not performed a reset for this table.
      do_sj_reset(qep_tab->flush_weedout_table);
    }
    /* Check all attached conditions for inner table rows. */
    if (qep_tab->condition() && !qep_tab->condition()->val_int()) {
      if (join->thd->killed) {
        join->thd->send_kill_message();
        return NESTED_LOOP_KILLED;
      }

      /* check for errors */
      if (join->thd->is_error()) return NESTED_LOOP_ERROR;

      matching = false;
      break;
    }
  }
  if (matching) {
    qep_tab = last_inner_tab;
    /*
      From the point of view of the rest of execution, this record matches
      (it has been built and satisfies conditions, no need to do more
      evaluation on it). See similar code in evaluate_join_record().
    */
    plan_idx f_u = QEP_AT(qep_tab, first_unmatched).first_upper();
    if (f_u != NO_PLAN_IDX && join->qep_tab[f_u].last_inner() != qep_tab->idx())
      f_u = NO_PLAN_IDX;
    qep_tab->first_unmatched = f_u;
    /*
      The row complemented by nulls satisfies all conditions
      attached to inner tables.
      Finish evaluation of the record and send it to be joined with
      the remaining tables.
      Note that evaluate_join_record will re-evaluate the condition attached
      to the last inner table of the current outer join. This is not deemed
      to have a significant performance impact.
    */
    rc = evaluate_join_record(join, qep_tab);
  }
  for (QEP_TAB *tab = first_inner_tab; tab <= last_inner_tab; tab++) {
    tab->table()->reset_null_row();
    // Restore NULL bits saved when reading row, @see EQRefIterator()
    if (tab->type() == JT_EQ_REF) tab->table()->restore_null_flags();
  }

  return rc;
}

/*****************************************************************************
  The different ways to read a record
  Returns -1 if row was not found, 0 if row was found and 1 on errors
*****************************************************************************/

/** Helper function for when we get an error from the table handler. */

int report_handler_error(TABLE *table, int error) {
  if (error == HA_ERR_END_OF_FILE || error == HA_ERR_KEY_NOT_FOUND) {
    table->set_no_row();
    return -1;  // key not found; ok
  }
  /*
    Do not spam the error log with these temporary errors:
      LOCK_DEADLOCK LOCK_WAIT_TIMEOUT TABLE_DEF_CHANGED
    Also skip printing to the error log if the current thread has been killed.
  */
  if (error != HA_ERR_LOCK_DEADLOCK && error != HA_ERR_LOCK_WAIT_TIMEOUT &&
      error != HA_ERR_TABLE_DEF_CHANGED && !table->in_use->killed)
    LogErr(ERROR_LEVEL, ER_READING_TABLE_FAILED, error, table->s->path.str);
  table->file->print_error(error, MYF(0));
  return 1;
}

/**
  Initialize an index scan and the record buffer to use in the scan.

  @param qep_tab the table to read
  @param file    the handler to initialize
  @param idx     the index to use
  @param sorted  use the sorted order of the index

  @retval true   if an error occurred
  @retval false  on success
*/
static bool init_index_and_record_buffer(const QEP_TAB *qep_tab, handler *file,
                                         uint idx, bool sorted) {
  if (file->inited) return false;  // OK, already initialized

  int error = file->ha_index_init(idx, sorted);
  if (error != 0) {
    (void)report_handler_error(qep_tab->table(), error);
    return true;
  }

  return set_record_buffer(qep_tab);
}

int safe_index_read(QEP_TAB *tab) {
  int error;
  TABLE *table = tab->table();
  if ((error = table->file->ha_index_read_map(
           table->record[0], tab->ref().key_buff,
           make_prev_keypart_map(tab->ref().key_parts), HA_READ_KEY_EXACT)))
    return report_handler_error(table, error);
  return 0;
}

/**
  Reads the content of a constant table.

  @param tab  table
  @param pos  position of table in query plan

  @retval 0   ok, one row was found or one NULL-complemented row was created
  @retval -1  ok, no row was found and no NULL-complemented row was created
  @retval 1   error
*/

int join_read_const_table(JOIN_TAB *tab, POSITION *pos) {
  int error;
  DBUG_TRACE;
  TABLE *table = tab->table();
  THD *const thd = tab->join()->thd;
  table->const_table = true;
  DBUG_ASSERT(!thd->is_error());

  if (table->reginfo.lock_type >= TL_WRITE_ALLOW_WRITE) {
    const enum_sql_command sql_command = tab->join()->thd->lex->sql_command;
    if (sql_command == SQLCOM_UPDATE_MULTI ||
        sql_command == SQLCOM_DELETE_MULTI) {
      /*
        In a multi-UPDATE, if we represent "depends on" with "->", we have:
        "what columns to read (read_set)" ->
        "whether table will be updated on-the-fly or with tmp table" ->
        "whether to-be-updated columns are used by access path" ->
        "access path to table (range, ref, scan...)" ->
        "query execution plan" ->
        "what tables are const" ->
        "reading const tables" ->
        "what columns to read (read_set)".
        To break this loop, we always read all columns of a constant table if
        it is going to be updated.
        Another case is in multi-UPDATE and multi-DELETE, when the table has a
        trigger: bits of columns needed by the trigger are turned on in
        result->optimize(), which has not yet been called when we do
        the reading now, so we must read all columns.
      */
      bitmap_set_all(table->read_set);
      /* Virtual generated columns must be writable */
      for (Field **vfield_ptr = table->vfield; vfield_ptr && *vfield_ptr;
           vfield_ptr++)
        bitmap_set_bit(table->write_set, (*vfield_ptr)->field_index);
      table->file->column_bitmaps_signal();
    }
  }

  if (tab->type() == JT_SYSTEM)
    error = read_system(table);
  else {
    if (!table->key_read && table->covering_keys.is_set(tab->ref().key) &&
        !table->no_keyread &&
        (int)table->reginfo.lock_type <= (int)TL_READ_HIGH_PRIORITY) {
      table->set_keyread(true);
      tab->set_index(tab->ref().key);
    }
    error = read_const(table, &tab->ref());
    table->set_keyread(false);
  }

  if (error) {
    // Promote error to fatal if an actual error was reported
    if (thd->is_error()) error = 1;
    /* Mark for EXPLAIN that the row was not found */
    pos->filter_effect = 1.0;
    pos->rows_fetched = 0.0;
    pos->prefix_rowcount = 0.0;
    pos->ref_depend_map = 0;
    if (!tab->table_ref->outer_join || error > 0) return error;
  }

  if (tab->join_cond() && !table->has_null_row()) {
    // We cannot handle outer-joined tables with expensive join conditions
    // here:
    DBUG_ASSERT(!tab->join_cond()->is_expensive());
    if (tab->join_cond()->val_int() == 0) table->set_null_row();
  }

  /* Check appearance of new constant items in Item_equal objects */
  JOIN *const join = tab->join();
  if (join->where_cond && update_const_equal_items(thd, join->where_cond, tab))
    return 1;
  TABLE_LIST *tbl;
  for (tbl = join->select_lex->leaf_tables; tbl; tbl = tbl->next_leaf) {
    TABLE_LIST *embedded;
    TABLE_LIST *embedding = tbl;
    do {
      embedded = embedding;
      if (embedded->join_cond_optim() &&
          update_const_equal_items(thd, embedded->join_cond_optim(), tab))
        return 1;
      embedding = embedded->embedding;
    } while (embedding && embedding->nested_join->join_list.head() == embedded);
  }

  return 0;
}

/**
  Read a constant table when there is at most one matching row, using a table
  scan.

  @param table  Table to read

  @retval 0   Row was found
  @retval -1  Row was not found
  @retval 1   Got an error (other than row not found) during read
*/
static int read_system(TABLE *table) {
  int error;
  if (!table->is_started())  // If first read
  {
    if ((error = table->file->ha_read_first_row(table->record[0],
                                                table->s->primary_key))) {
      if (error != HA_ERR_END_OF_FILE)
        return report_handler_error(table, error);
      table->set_null_row();
      empty_record(table);  // Make empty record
      return -1;
    }
    store_record(table, record[1]);
  } else if (table->has_row() && table->is_nullable()) {
    /*
      The row buffer contains a row, but it may have been partially
      overwritten by a null-extended row. Restore the row from the saved
      copy.
      @note this branch is currently unused.
    */
    DBUG_ASSERT(false);
    table->set_found_row();
    restore_record(table, record[1]);
  }

  return table->has_row() ? 0 : -1;
}

ConstIterator::ConstIterator(THD *thd, TABLE *table, TABLE_REF *table_ref,
                             ha_rows *examined_rows)
    : TableRowIterator(thd, table),
      m_ref(table_ref),
      m_examined_rows(examined_rows) {}

bool ConstIterator::Init() {
  m_first_record_since_init = true;
  return false;
}

/**
  Read a constant table when there is at most one matching row, using an
  index lookup.

  @retval 0   Row was found
  @retval -1  Row was not found
  @retval 1   Got an error (other than row not found) during read
*/

int ConstIterator::Read() {
  if (!m_first_record_since_init) {
    return -1;
  }
  m_first_record_since_init = false;
  int err = read_const(table(), m_ref);
  if (err == 0 && m_examined_rows != nullptr) {
    ++*m_examined_rows;
  }
  table()->const_table = true;
  return err;
}

vector<string> ConstIterator::DebugString() const {
  DBUG_ASSERT(table()->file->pushed_idx_cond == nullptr);
  DBUG_ASSERT(table()->file->pushed_cond == nullptr);
  return {string("Constant row from ") + table()->alias};
}

static int read_const(TABLE *table, TABLE_REF *ref) {
  int error;
  DBUG_TRACE;

  if (!table->is_started())  // If first read
  {
    /* Perform "Late NULLs Filtering" (see internals manual for explanations) */
    if (ref->impossible_null_ref() ||
        cp_buffer_from_ref(table->in_use, table, ref))
      error = HA_ERR_KEY_NOT_FOUND;
    else {
      error = table->file->ha_index_read_idx_map(
          table->record[0], ref->key, ref->key_buff,
          make_prev_keypart_map(ref->key_parts), HA_READ_KEY_EXACT);
    }
    if (error) {
      if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE) {
        const int ret = report_handler_error(table, error);
        return ret;
      }
      table->set_no_row();
      table->set_null_row();
      empty_record(table);
      return -1;
    }
    /*
      read_const() may be called several times inside a nested loop join.
      Save record in case it is needed when table is in "started" state.
    */
    store_record(table, record[1]);
  } else if (table->has_row() && table->is_nullable()) {
    /*
      The row buffer contains a row, but it may have been partially
      overwritten by a null-extended row. Restore the row from the saved
      copy.
    */
    table->set_found_row();
    restore_record(table, record[1]);
  }
  return table->has_row() ? 0 : -1;
}

EQRefIterator::EQRefIterator(THD *thd, TABLE *table, TABLE_REF *ref,
                             bool use_order, ha_rows *examined_rows)
    : TableRowIterator(thd, table),
      m_ref(ref),
      m_use_order(use_order),
      m_examined_rows(examined_rows) {}

/**
  Read a row using a unique key: eq_ref access method implementation.

  @details
  This is the "read_first" function for the eq_ref access method.
  The difference from the ref access function is that it has a one-element
  lookup cache, maintained in record[0]. Since the eq_ref access method
  will always return the same row, it is not necessary to read the row
  more than once, regardless of how many times it is needed in execution.
  This cache element is used when a row is needed after it has been read once,
  unless a key conversion error has occurred, or the cache has been disabled.

  @retval 0  - Ok
  @retval -1 - Row not found
  @retval 1  - Error
*/

bool EQRefIterator::Init() {
  if (!table()->file->inited) {
    DBUG_ASSERT(!m_use_order);  // Don't expect sort req. for single row.
    int error = table()->file->ha_index_init(m_ref->key, m_use_order);
    if (error) {
      PrintError(error);
      return true;
    }
  }

  m_first_record_since_init = true;

  return false;
}

/**
  Read a row using a unique key: eq_ref access method implementation.

  @details
  The difference from RefIterator is that it has a one-element
  lookup cache, maintained in record[0]. Since the eq_ref access method
  will always return the same row, it is not necessary to read the row
  more than once, regardless of how many times it is needed in execution.
  This cache element is used when a row is needed after it has been read once,
  unless a key conversion error has occurred, or the cache has been disabled.

  @retval 0  - Ok
  @retval -1 - Row not found
  @retval 1  - Error
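
  A minimal sketch of when the cache pays off (hypothetical query):

  @code
    SELECT * FROM t1 JOIN t2 ON t2.pk = t1.a;
  @endcode

  If several consecutive t1 rows carry the same value of t1.a, every lookup
  after the first is served from the cached row in record[0] instead of
  probing the index again.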
*/

int EQRefIterator::Read() {
  if (!m_first_record_since_init) {
    return -1;
  }
  m_first_record_since_init = false;

  /*
    Calculate whether we need to read the row. It is always needed if
    - no rows have been read yet, or
    - the table has a pushed condition, or
    - the cache is disabled, or
    - the previous lookup caused an error when calculating the key.
  */
  bool read_row = !table()->is_started() || table()->file->pushed_cond ||
                  m_ref->disable_cache || m_ref->key_err;
  if (!read_row)
    // Last lookup found a row, copy its key to secondary buffer
    memcpy(m_ref->key_buff2, m_ref->key_buff, m_ref->key_length);

  // Create new key for lookup
  m_ref->key_err = cp_buffer_from_ref(table()->in_use, table(), m_ref);
  if (m_ref->key_err) {
    table()->set_no_row();
    return -1;
  }

  // Re-use the current row only if the keys are equal
  if (!read_row &&
      memcmp(m_ref->key_buff2, m_ref->key_buff, m_ref->key_length) != 0)
    read_row = true;

  if (read_row) {
    /*
      Moving away from the current record. Unlock the row
      in the handler if it did not match the partial WHERE.
    */
    if (table()->has_row() && m_ref->use_count == 0)
      table()->file->unlock_row();

    /*
      Perform "Late NULLs Filtering" (see internals manual for explanations)

      As EQRefIterator effectively implements a one-row cache of the last
      fetched row, the NULLs filtering can't be done until after the cache
      key has been checked and updated, and row locks maintained.
    */
    if (m_ref->impossible_null_ref()) {
      DBUG_PRINT("info", ("EQRefIterator null_rejected"));
      table()->set_no_row();
      return -1;
    }

    int error = table()->file->ha_index_read_map(
        table()->record[0], m_ref->key_buff,
        make_prev_keypart_map(m_ref->key_parts), HA_READ_KEY_EXACT);
    if (error) {
      return HandleError(error);
    }

    m_ref->use_count = 1;
    table()->save_null_flags();
  } else if (table()->has_row()) {
    DBUG_ASSERT(!table()->has_null_row());
    table()->restore_null_flags();
    m_ref->use_count++;
  }

  if (table()->has_row() && m_examined_rows != nullptr) {
    ++*m_examined_rows;
  }
  return table()->has_row() ? 0 : -1;
}

/**
  Since EQRefIterator may buffer a record, do not unlock
  it if it was not used in this invocation of EQRefIterator::Read().
  Only count locks, thus remembering if the record was left unused,
  and unlock it already when pruning the current value of the
  TABLE_REF buffer.
  @sa EQRefIterator::Read()
*/

void EQRefIterator::UnlockRow() {
  DBUG_ASSERT(m_ref->use_count);
  if (m_ref->use_count) m_ref->use_count--;
}

vector<string> EQRefIterator::DebugString() const {
  const KEY *key = &table()->key_info[m_ref->key];
  string str = string("Single-row index lookup on ") + table()->alias +
               " using " + key->name + " (" +
               RefToString(*m_ref, key, /*include_nulls=*/false) + ")";
  if (table()->file->pushed_idx_cond != nullptr) {
    str += ", with index condition: " +
           ItemToString(table()->file->pushed_idx_cond);
  }
  str += table()->file->explain_extra();
  return {str};
}

PushedJoinRefIterator::PushedJoinRefIterator(THD *thd, TABLE *table,
                                             TABLE_REF *ref, bool use_order,
                                             ha_rows *examined_rows)
    : TableRowIterator(thd, table),
      m_ref(ref),
      m_use_order(use_order),
      m_examined_rows(examined_rows) {}

bool PushedJoinRefIterator::Init() {
  DBUG_ASSERT(!m_use_order);  // Pushed child can't be sorted

  if (!table()->file->inited) {
    int error = table()->file->ha_index_init(m_ref->key, m_use_order);
    if (error) {
      PrintError(error);
      return true;
    }
  }

  m_first_record_since_init = true;
  return false;
}

int PushedJoinRefIterator::Read() {
  if (m_first_record_since_init) {
    m_first_record_since_init = false;

    /* Perform "Late NULLs Filtering" (see internals manual for explanations) */
    if (m_ref->impossible_null_ref()) {
      table()->set_no_row();
      DBUG_PRINT("info", ("PushedJoinRefIterator::Read() null_rejected"));
      return -1;
    }

    if (cp_buffer_from_ref(thd(), table(), m_ref)) {
      table()->set_no_row();
      return -1;
    }

    // The 'read' itself is a NOOP:
    // handler::ha_index_read_pushed() only unpacks the prefetched row and
    // sets 'status'.
    int error = table()->file->ha_index_read_pushed(
        table()->record[0], m_ref->key_buff,
        make_prev_keypart_map(m_ref->key_parts));
    if (error) {
      return HandleError(error);
    }
  } else {
    int error = table()->file->ha_index_next_pushed(table()->record[0]);
    if (error) {
      return HandleError(error);
    }
  }
  if (m_examined_rows != nullptr) {
    ++*m_examined_rows;
  }
  return 0;
}

vector<string> PushedJoinRefIterator::DebugString() const {
  DBUG_ASSERT(table()->file->pushed_idx_cond == nullptr);
  const KEY *key = &table()->key_info[m_ref->key];
  return {string("Index lookup on ") + table()->alias + " using " + key->name +
          " (" + RefToString(*m_ref, key, /*include_nulls=*/false) + ")" +
          table()->file->explain_extra()};
}
|
|
|
|
template <bool Reverse>
RefIterator<Reverse>::RefIterator(THD *thd, TABLE *table, TABLE_REF *ref,
                                  bool use_order, QEP_TAB *qep_tab,
                                  ha_rows *examined_rows)
    : TableRowIterator(thd, table),
      m_ref(ref),
      m_use_order(use_order),
      m_qep_tab(qep_tab),
      m_examined_rows(examined_rows) {}

template <bool Reverse>
bool RefIterator<Reverse>::Init() {
  m_first_record_since_init = true;
  return init_index_and_record_buffer(m_qep_tab, m_qep_tab->table()->file,
                                      m_ref->key, m_use_order);
}

template <bool Reverse>
vector<string> RefIterator<Reverse>::DebugString() const {
  const KEY *key = &table()->key_info[m_ref->key];
  string str = string("Index lookup on ") + table()->alias + " using " +
               key->name + " (" +
               RefToString(*m_ref, key, /*include_nulls=*/false);
  if (Reverse) {
    str += "; iterate backwards";
  }
  str += ")";
  if (table()->file->pushed_idx_cond != nullptr) {
    str += ", with index condition: " +
           ItemToString(table()->file->pushed_idx_cond);
  }
  str += table()->file->explain_extra();
  return {str};
}

// Doxygen gets confused by the explicit specializations.

//! @cond
template <>
int RefIterator<false>::Read() {  // Forward read.
  if (m_first_record_since_init) {
    m_first_record_since_init = false;

    /*
      a = b can never return true if a or b is NULL, so if we're asked
      to do such a lookup, we can say there won't be a match without even
      checking the index. This is “late NULLs filtering” (as opposed to
      “early NULLs filtering”, which propagates the IS NOT NULL constraint
      further back to the other table so we don't even get the request).
      See the internals manual for more details.
    */
    if (m_ref->impossible_null_ref()) {
      DBUG_PRINT("info", ("RefIterator null_rejected"));
      table()->set_no_row();
      return -1;
    }
    if (cp_buffer_from_ref(thd(), table(), m_ref)) {
      table()->set_no_row();
      return -1;
    }
    int error = table()->file->ha_index_read_map(
        table()->record[0], m_ref->key_buff,
        make_prev_keypart_map(m_ref->key_parts), HA_READ_KEY_EXACT);
    if (error) {
      return HandleError(error);
    }
  } else {
    int error = table()->file->ha_index_next_same(
        table()->record[0], m_ref->key_buff, m_ref->key_length);
    if (error) {
      return HandleError(error);
    }
  }
  if (m_examined_rows != nullptr) {
    ++*m_examined_rows;
  }
  return 0;
}

/**
  This function is used when optimizing away ORDER BY in
  SELECT * FROM t1 WHERE a=1 ORDER BY a DESC,b DESC.
*/
template <>
int RefIterator<true>::Read() {  // Reverse read.
  if (m_first_record_since_init) {
    m_first_record_since_init = false;

    /*
      a = b can never return true if a or b is NULL, so if we're asked
      to do such a lookup, we can say there won't be a match without even
      checking the index. This is “late NULLs filtering” (as opposed to
      “early NULLs filtering”, which propagates the IS NOT NULL constraint
      further back to the other table so we don't even get the request).
      See the internals manual for more details.
    */
    if (m_ref->impossible_null_ref()) {
      DBUG_PRINT("info", ("RefIterator null_rejected"));
      table()->set_no_row();
      return -1;
    }
    if (cp_buffer_from_ref(thd(), table(), m_ref)) {
      table()->set_no_row();
      return -1;
    }
    int error = table()->file->ha_index_read_last_map(
        table()->record[0], m_ref->key_buff,
        make_prev_keypart_map(m_ref->key_parts));
    if (error) {
      return HandleError(error);
    }
  } else {
    /*
      Using ha_index_prev() for reading records from the table can cause
      performance issues if used in combination with ICP. The ICP code
      in the storage engine does not know when to stop reading from the
      index and a call to ha_index_prev() might cause the storage engine
      to read to the beginning of the index if no qualifying record is
      found.
    */
    DBUG_ASSERT(table()->file->pushed_idx_cond == NULL);
    int error = table()->file->ha_index_prev(table()->record[0]);
    if (error) {
      return HandleError(error);
    }
    if (key_cmp_if_same(table(), m_ref->key_buff, m_ref->key,
                        m_ref->key_length)) {
      table()->set_no_row();
      return -1;
    }
  }
  if (m_examined_rows != nullptr) {
    ++*m_examined_rows;
  }
  return 0;
}
//! @endcond

DynamicRangeIterator::DynamicRangeIterator(THD *thd, TABLE *table,
                                           QEP_TAB *qep_tab,
                                           ha_rows *examined_rows)
    : TableRowIterator(thd, table),
      m_qep_tab(qep_tab),
      m_examined_rows(examined_rows) {}

bool DynamicRangeIterator::Init() {
  // The range optimizer generally expects this to be set.
  thd()->lex->set_current_select(m_qep_tab->join()->select_lex);

  Opt_trace_context *const trace = &thd()->opt_trace;
  const bool disable_trace =
      m_quick_traced_before &&
      !trace->feature_enabled(Opt_trace_context::DYNAMIC_RANGE);
  Opt_trace_disable_I_S disable_trace_wrapper(trace, disable_trace);

  m_quick_traced_before = true;

  Opt_trace_object wrapper(trace);
  Opt_trace_object trace_table(trace, "rows_estimation_per_outer_row");
  trace_table.add_utf8_table(m_qep_tab->table_ref);

  Key_map needed_reg_dummy;
  QUICK_SELECT_I *old_qck = m_qep_tab->quick();
  QUICK_SELECT_I *qck;
  DEBUG_SYNC(thd(), "quick_not_created");
  const int rc = test_quick_select(thd(), m_qep_tab->keys(),
                                   0,  // empty table map
                                   HA_POS_ERROR,
                                   false,  // don't force quick range
                                   ORDER_NOT_RELEVANT, m_qep_tab,
                                   m_qep_tab->condition(), &needed_reg_dummy,
                                   &qck, m_qep_tab->table()->force_index);
  if (thd()->is_error())  // @todo consolidate error reporting of
                          // test_quick_select
    return true;
  DBUG_ASSERT(old_qck == NULL || old_qck != qck);
  m_qep_tab->set_quick(qck);

  /*
    EXPLAIN CONNECTION is used to understand why a query is currently taking
    so much time. So it makes sense to show what the execution is doing now:
    is it a table scan or a range scan? A range scan on which index.
    So: below we want to change the type and quick visible in EXPLAIN, and for
    that, we need to take mutex and change type and quick_optim.
  */

  DEBUG_SYNC(thd(), "quick_created_before_mutex");

  thd()->lock_query_plan();
  m_qep_tab->set_type(qck ? calc_join_type(qck->get_type()) : JT_ALL);
  m_qep_tab->set_quick_optim();
  thd()->unlock_query_plan();

  delete old_qck;
  DEBUG_SYNC(thd(), "quick_droped_after_mutex");

  // Clear out and destroy any old iterators before we start constructing
  // new ones, since they may share the same memory in the union.
  m_iterator.reset();

  if (rc == -1) {
    return false;
  }

  if (qck) {
    m_iterator = NewIterator<IndexRangeScanIterator>(
        thd(), table(), qck, m_qep_tab, m_examined_rows);
  } else {
    m_iterator = NewIterator<TableScanIterator>(thd(), table(), m_qep_tab,
                                                m_examined_rows);
  }
  return m_iterator->Init();
}

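/*
  Illustrative example of the re-planning above (hypothetical tables, added
  for clarity): a query like

    SELECT * FROM t1, t2 WHERE t2.key1 < t1.a OR t2.key2 > t1.b;

  may show "Range checked for each record" in EXPLAIN. For every outer row
  of t1, Init() re-runs test_quick_select() on t2 with the current values of
  t1.a and t1.b, then picks either a fresh range scan or a plain table scan
  for this iteration.
*/
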
int DynamicRangeIterator::Read() {
  if (m_iterator == nullptr) {
    return -1;
  } else {
    return m_iterator->Read();
  }
}

vector<string> DynamicRangeIterator::DebugString() const {
  // TODO: Convert QUICK_SELECT_I to RowIterator so that we can get
  // better outputs here (similar to dbug_dump()), although it might
  // get tricky when there are many alternatives.
  string str = string("Index range scan on ") + table()->alias +
               " (re-planned for each iteration)";
  if (table()->file->pushed_idx_cond != nullptr) {
    str += ", with index condition: " +
           ItemToString(table()->file->pushed_idx_cond);
  }
  str += table()->file->explain_extra();
  return {str};
}

/**
  @brief Prepare table for reading rows and read first record.
  @details
    Prior to reading the table, the following tasks are performed (in the
    order of execution):
      .) derived tables are materialized
      .) pre-iterator executor only: duplicates removed (tmp tables only)
      .) table is sorted with filesort (both non-tmp and tmp tables)
    After this has been done, this function resets the quick select, if
    present, sets up table reading functions, and reads the first record.

  @retval
    0   Ok
  @retval
    -1   End of records
  @retval
    1   Error
*/

void join_setup_iterator(QEP_TAB *tab) {
  bool using_table_scan;
  tab->iterator =
      create_table_iterator(tab->join()->thd, NULL, tab, false,
                            /*ignore_not_found_rows=*/false,
                            /*examined_rows=*/nullptr, &using_table_scan);
  tab->set_using_table_scan(using_table_scan);

  if (tab->filesort) {
    unique_ptr_destroy_only<RowIterator> iterator = move(tab->iterator);

    if (tab->condition()) {
      iterator = NewIterator<FilterIterator>(tab->join()->thd, move(iterator),
                                             tab->condition());
    }

    // Wrap the chosen RowIterator in a SortingIterator, so that we get
    // sorted results out.
    tab->iterator = NewIterator<SortingIterator>(tab->join()->thd,
                                                 tab->filesort, move(iterator),
                                                 &tab->join()->examined_rows);
    tab->table()->sorting_iterator =
        down_cast<SortingIterator *>(tab->iterator->real_iterator());
  }
}

/*
  This helper function materializes the result table of a table function
  and sets up access to it.
*/

int join_materialize_table_function(QEP_TAB *tab) {
  TABLE_LIST *const table = tab->table_ref;
  DBUG_ASSERT(table->table_function);

  (void)table->table_function->fill_result_table();

  return table->table->in_use->is_error() ? NESTED_LOOP_ERROR : NESTED_LOOP_OK;
}

/*
  This helper function materializes derived table/view and then calls
  read_first_record function to set up access to the materialized table.
*/

int join_materialize_derived(QEP_TAB *tab) {
  THD *const thd = tab->table()->in_use;
  TABLE_LIST *const derived = tab->table_ref;

  DBUG_ASSERT(derived->uses_materialization() && !tab->table()->materialized);

  if (derived->materializable_is_const())  // Has been materialized by optimizer
    return NESTED_LOOP_OK;

  bool res = derived->materialize_derived(thd);
  res |= derived->cleanup_derived(thd);
  DEBUG_SYNC(thd, "after_materialize_derived");
  return res ? NESTED_LOOP_ERROR : NESTED_LOOP_OK;
}

/*
  Helper function for materialization of a semi-joined subquery.

  @param tab JOIN_TAB referencing a materialized semi-join table

  @return Nested loop state
*/

int join_materialize_semijoin(QEP_TAB *tab) {
  DBUG_TRACE;

  Semijoin_mat_exec *const sjm = tab->sj_mat_exec();

  QEP_TAB *const first = tab->join()->qep_tab + sjm->inner_table_index;
  QEP_TAB *const last = first + (sjm->table_count - 1);
  /*
    Set up the end_sj_materialize function after the last inner table,
    so that generated rows are inserted into the materialized table.
  */
  last->next_select = end_sj_materialize;
  last->set_sj_mat_exec(sjm);  // TODO: This violates comment for sj_mat_exec!
  if (tab->table()->hash_field) tab->table()->file->ha_index_init(0, 0);
  int rc;
  if ((rc = sub_select(tab->join(), first, false)) < 0) return rc;
  if ((rc = sub_select(tab->join(), first, true)) < 0) return rc;
  if (tab->table()->hash_field) tab->table()->file->ha_index_or_rnd_end();

  last->next_select = NULL;
  last->set_sj_mat_exec(NULL);

#if !defined(DBUG_OFF) || defined(HAVE_VALGRIND)
  // Fields of inner tables should not be read anymore:
  for (QEP_TAB *t = first; t <= last; t++) {
    // Rows may persist across executions for these types:
    if (t->type() == JT_EQ_REF || t->type() == JT_CONST ||
        t->type() == JT_SYSTEM)
      continue;
    TABLE *const inner_table = t->table();
    TRASH(inner_table->record[0], inner_table->s->reclength);
  }
#endif

  tab->table()->materialized = true;
  return NESTED_LOOP_OK;
}

/**
  Check if access to this JOIN_TAB has to retrieve rows
  in sorted order as defined by the ordered index
  used to access this table.
*/
bool QEP_TAB::use_order() const {
  /*
    No need to require sorted access for single row reads
    being performed by const- or EQ_REF-accessed tables.
  */
  if (type() == JT_EQ_REF || type() == JT_CONST || type() == JT_SYSTEM)
    return false;

  /*
    First non-const table requires sorted results
    if ORDER or GROUP BY use ordered index.
  */
  if ((uint)idx() == join()->const_tables &&
      join()->m_ordered_index_usage != JOIN::ORDERED_INDEX_VOID)
    return true;

  /*
    LooseScan strategy for semijoin requires sorted
    results even if final result is not to be sorted.
  */
  if (position()->sj_strategy == SJ_OPT_LOOSE_SCAN) return true;

  /* Fall through: Results don't have to be sorted */
  return false;
}

FullTextSearchIterator::FullTextSearchIterator(THD *thd, TABLE *table,
                                               TABLE_REF *ref, bool use_order,
                                               ha_rows *examined_rows)
    : TableRowIterator(thd, table),
      m_ref(ref),
      m_use_order(use_order),
      m_examined_rows(examined_rows) {}

FullTextSearchIterator::~FullTextSearchIterator() {
  table()->file->ha_index_or_rnd_end();
}

bool FullTextSearchIterator::Init() {
  if (!table()->file->inited) {
    int error = table()->file->ha_index_init(m_ref->key, m_use_order);
    if (error) {
      PrintError(error);
      return true;
    }
  }
  table()->file->ft_init();
  return false;
}

int FullTextSearchIterator::Read() {
  int error = table()->file->ha_ft_read(table()->record[0]);
  if (error) {
    return HandleError(error);
  }
  if (m_examined_rows != nullptr) {
    ++*m_examined_rows;
  }
  return 0;
}

vector<string> FullTextSearchIterator::DebugString() const {
  DBUG_ASSERT(table()->file->pushed_idx_cond == nullptr);
  const KEY *key = &table()->key_info[m_ref->key];
  return {string("Indexed full text search on ") + table()->alias + " using " +
          key->name + " (" + RefToString(*m_ref, key, /*include_nulls=*/false) +
          ")" + table()->file->explain_extra()};
}

/**
  Reading of key with key reference and one part that may be NULL.
*/

RefOrNullIterator::RefOrNullIterator(THD *thd, TABLE *table, TABLE_REF *ref,
                                     bool use_order, QEP_TAB *qep_tab,
                                     ha_rows *examined_rows)
    : TableRowIterator(thd, table),
      m_ref(ref),
      m_use_order(use_order),
      m_qep_tab(qep_tab),
      m_examined_rows(examined_rows) {}

bool RefOrNullIterator::Init() {
  m_reading_first_row = true;
  *m_ref->null_ref_key = false;
  return init_index_and_record_buffer(m_qep_tab, m_qep_tab->table()->file,
                                      m_ref->key, m_use_order);
}

int RefOrNullIterator::Read() {
  if (m_reading_first_row && !*m_ref->null_ref_key) {
    /* Perform "Late NULLs Filtering" (see internals manual for explanations)
     */
    if (m_ref->impossible_null_ref() ||
        cp_buffer_from_ref(thd(), table(), m_ref)) {
      // Skip searching for non-NULL rows; go straight to NULL rows.
      *m_ref->null_ref_key = true;
    }
  }

  int error;
  if (m_reading_first_row) {
    m_reading_first_row = false;
    error = table()->file->ha_index_read_map(
        table()->record[0], m_ref->key_buff,
        make_prev_keypart_map(m_ref->key_parts), HA_READ_KEY_EXACT);
  } else {
    error = table()->file->ha_index_next_same(
        table()->record[0], m_ref->key_buff, m_ref->key_length);
  }

  if (error == 0) {
    if (m_examined_rows != nullptr) {
      ++*m_examined_rows;
    }
    return 0;
  } else if (error == HA_ERR_END_OF_FILE || error == HA_ERR_KEY_NOT_FOUND) {
    if (!*m_ref->null_ref_key) {
      // No more non-NULL rows; try again with NULL rows.
      *m_ref->null_ref_key = true;
      m_reading_first_row = true;
      return Read();
    } else {
      // Real EOF.
      table()->set_no_row();
      return -1;
    }
  } else {
    return HandleError(error);
  }
}

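/*
  Illustrative example (hypothetical table, added for clarity): a query
  such as

    SELECT * FROM t1 WHERE key_col = 7 OR key_col IS NULL;

  is served by two consecutive index lookups: first the range of rows with
  key_col = 7, then, after flipping *null_ref_key, the rows with
  key_col IS NULL. Read() above presents both passes as one row stream.
*/
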
vector<string> RefOrNullIterator::DebugString() const {
  const KEY *key = &table()->key_info[m_ref->key];
  string str = string("Index lookup on ") + table()->alias + " using " +
               key->name + " (" +
               RefToString(*m_ref, key, /*include_nulls=*/true) + ")";
  if (table()->file->pushed_idx_cond != nullptr) {
    str += ", with index condition: " +
           ItemToString(table()->file->pushed_idx_cond);
  }
  str += table()->file->explain_extra();
  return {str};
}

AlternativeIterator::AlternativeIterator(
    THD *thd, TABLE *table, QEP_TAB *qep_tab, ha_rows *examined_rows,
    unique_ptr_destroy_only<RowIterator> source, TABLE_REF *ref)
    : RowIterator(thd),
      m_ref(ref),
      m_source_iterator(std::move(source)),
      m_table_scan_iterator(
          NewIterator<TableScanIterator>(thd, table, qep_tab, examined_rows)) {
  for (unsigned key_part_idx = 0; key_part_idx < ref->key_parts;
       ++key_part_idx) {
    bool *cond_guard = ref->cond_guards[key_part_idx];
    if (cond_guard != nullptr) {
      m_applicable_cond_guards.push_back(cond_guard);
    }
  }
  DBUG_ASSERT(!m_applicable_cond_guards.empty());
}

bool AlternativeIterator::Init() {
  m_iterator = m_source_iterator.get();
  for (bool *cond_guard : m_applicable_cond_guards) {
    if (!*cond_guard) {
      m_iterator = m_table_scan_iterator.get();
      break;
    }
  }
  return m_iterator->Init();
}

vector<string> AlternativeIterator::DebugString() const {
  const TABLE *table =
      down_cast<TableScanIterator *>(m_table_scan_iterator->real_iterator())
          ->table();
  const KEY *key = &table->key_info[m_ref->key];
  string ret = "Alternative plans for IN subquery: Index lookup unless ";
  if (m_applicable_cond_guards.size() > 1) {
    ret += " any of (";
  }
  bool first = true;
  for (unsigned key_part_idx = 0; key_part_idx < m_ref->key_parts;
       ++key_part_idx) {
    if (m_ref->cond_guards[key_part_idx] == nullptr) {
      continue;
    }
    if (!first) {
      ret += ", ";
    }
    first = false;
    ret += key->key_part[key_part_idx].field->field_name;
  }
  if (m_applicable_cond_guards.size() > 1) {
    ret += ")";
  }
  ret += " IS NULL";
  return {ret};
}

/**
  Pick the appropriate access method functions

  Sets the functions for the selected table access method
*/

void QEP_TAB::pick_table_access_method() {
  DBUG_ASSERT(table());
  // Only some access methods support reversed access:
  DBUG_ASSERT(!m_reversed_access || type() == JT_REF ||
              type() == JT_INDEX_SCAN);
  TABLE_REF *used_ref = nullptr;

  const TABLE *pushed_root = table()->file->member_of_pushed_join();
  const bool is_pushed_child = (pushed_root && pushed_root != table());
  // A 'pushed_child' has to be a REF type
  DBUG_ASSERT(!is_pushed_child || type() == JT_REF || type() == JT_EQ_REF);

  switch (type()) {
    case JT_REF:
      if (is_pushed_child) {
        DBUG_ASSERT(!m_reversed_access);
        iterator = NewIterator<PushedJoinRefIterator>(
            join()->thd, table(), &ref(), use_order(), &join()->examined_rows);
      } else if (m_reversed_access) {
        iterator = NewIterator<RefIterator<true>>(join()->thd, table(), &ref(),
                                                  use_order(), this,
                                                  &join()->examined_rows);
      } else {
        iterator = NewIterator<RefIterator<false>>(join()->thd, table(),
                                                   &ref(), use_order(), this,
                                                   &join()->examined_rows);
      }
      used_ref = &ref();
      break;

    case JT_REF_OR_NULL:
      iterator = NewIterator<RefOrNullIterator>(join()->thd, table(), &ref(),
                                                use_order(), this,
                                                &join()->examined_rows);
      used_ref = &ref();
      break;

    case JT_CONST:
      iterator = NewIterator<ConstIterator>(join()->thd, table(), &ref(),
                                            &join()->examined_rows);
      break;

    case JT_EQ_REF:
      if (is_pushed_child) {
        iterator = NewIterator<PushedJoinRefIterator>(
            join()->thd, table(), &ref(), use_order(), &join()->examined_rows);
      } else {
        iterator = NewIterator<EQRefIterator>(
            join()->thd, table(), &ref(), use_order(), &join()->examined_rows);
      }
      used_ref = &ref();
      break;

    case JT_FT:
      iterator = NewIterator<FullTextSearchIterator>(
          join()->thd, table(), &ref(), use_order(), &join()->examined_rows);
      used_ref = &ref();
      break;

    case JT_INDEX_SCAN:
      if (m_reversed_access) {
        iterator = NewIterator<IndexScanIterator<true>>(
            join()->thd, table(), index(), use_order(), this,
            &join()->examined_rows);
      } else {
        iterator = NewIterator<IndexScanIterator<false>>(
            join()->thd, table(), index(), use_order(), this,
            &join()->examined_rows);
      }
      break;
    case JT_ALL:
    case JT_RANGE:
    case JT_INDEX_MERGE:
      if (using_dynamic_range) {
        iterator = NewIterator<DynamicRangeIterator>(join()->thd, table(),
                                                     this,
                                                     &join()->examined_rows);
      } else {
        iterator =
            create_table_iterator(join()->thd, NULL, this, false,
                                  /*ignore_not_found_rows=*/false,
                                  &join()->examined_rows, &m_using_table_scan);
      }
      break;
    default:
      DBUG_ASSERT(0);
      break;
  }

  /*
    If we have an item like <expr> IN ( SELECT f2 FROM t2 ), and we were not
    able to rewrite it into a semijoin, the optimizer may rewrite it into
    EXISTS ( SELECT 1 FROM t2 WHERE f2=<expr> LIMIT 1 ) (i.e., pushing down
    the value into the subquery), using a REF or REF_OR_NULL scan on t2 if
    possible. This happens in Item_in_subselect::select_in_like_transformer()
    and the functions it calls.

    However, if <expr> evaluates to NULL, this transformation is incorrect,
    and the transformation used should instead be to

      EXISTS ( SELECT 1 FROM t2 LIMIT 1 ) ? NULL : FALSE.

    Thus, in the case of nullable <expr>, the rewriter inserts so-called
    “condition guards” (pointers to bool saying whether <expr> was NULL or
    not, for each part of <expr> if it contains multiple columns). These
    condition guards do two things:

      1. They disable the pushed-down WHERE clauses.
      2. They change the REF/REF_OR_NULL accesses to table scans.

    We don't need to worry about #1 here, but #2 needs to be dealt with,
    as it changes the plan. We solve it by inserting an AlternativeIterator
    that chooses between two sub-iterators at execution time, based on the
    condition guard in question.

    Note that ideally, we'd plan a completely separate plan for the NULL
    case, as there might be e.g. a different index we could scan on, or even
    a different optimal join order. (Note, however, that for the case of
    multiple columns in the expression, we could get 2^N different plans.)
    However, given that most cases are now handled by semijoins and not
    in2exists at all, we don't need to jump through every possible hoop to
    optimize these cases.
  */
  if (used_ref != nullptr) {
    for (unsigned key_part_idx = 0; key_part_idx < used_ref->key_parts;
         ++key_part_idx) {
      if (used_ref->cond_guards[key_part_idx] != nullptr) {
        DBUG_ASSERT(!is_pushed_child);
        // At least one condition guard is relevant, so we need to use
        // the AlternativeIterator.
        iterator = NewIterator<AlternativeIterator>(join()->thd, table(), this,
                                                    &join()->examined_rows,
                                                    move(iterator), used_ref);
        break;
      }
    }
  }
}

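/*
  Illustrative example of the guard mechanism above (hypothetical tables,
  added for clarity): for

    SELECT * FROM t1 WHERE t1.a IN (SELECT f2 FROM t2);

  with a nullable t1.a and no semijoin rewrite, the plan keeps both an index
  lookup on t2(f2) and a table scan of t2. At execution time,
  AlternativeIterator::Init() inspects the condition guard: when t1.a is
  non-NULL it runs the lookup; when t1.a is NULL it falls back to the scan so
  that the EXISTS semantics stay correct.
*/
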
/*****************************************************************************
  DESCRIPTION
    Functions that end one nested loop iteration. Different functions
    are used to support GROUP BY clause and to redirect records
    to a table (e.g. in case of SELECT into a temporary table) or to the
    network client.
    See the enum_nested_loop_state enumeration for the description of return
    values.
*****************************************************************************/

/* ARGSUSED */
static enum_nested_loop_state end_send(JOIN *join, QEP_TAB *qep_tab,
                                       bool end_of_records) {
  DBUG_TRACE;
  /*
    When all tables are const this function is called with qep_tab == NULL.
    This function shouldn't be called for the first join_tab as it needs
    to get fields from the previous tab.

    Note that qep_tab may be one past the last of qep_tab! So don't read its
    pointed content. But you can read qep_tab[-1] then.
  */
  DBUG_ASSERT(qep_tab == NULL || qep_tab > join->qep_tab);
  THD *thd = join->thd;

  if (!end_of_records) {
    int error;
    int sliceno;
    if (qep_tab) {
      if (qep_tab - 1 == join->ref_slice_immediately_before_group_by) {
        // Read Items from pseudo-table REF_SLICE_ORDERED_GROUP_BY
        sliceno = REF_SLICE_ORDERED_GROUP_BY;
      } else {
        sliceno = qep_tab[-1].ref_item_slice;
      }
    } else {
      // All-constant tables; no change of slice
      sliceno = join->current_ref_item_slice;
    }
    Switch_ref_item_slice slice_switch(join, sliceno);
    List<Item> *fields = join->get_current_fields();
    if (join->tables &&
        // In case filesort has been used and zeroed quick():
        (join->qep_tab[0].quick_optim() &&
         join->qep_tab[0].quick_optim()->is_loose_index_scan())) {
      // Copy non-aggregated fields when loose index scan is used.
      if (copy_fields(&join->tmp_table_param, thd))
        return NESTED_LOOP_ERROR; /* purecov: inspected */
    }
    // Filter HAVING if not done earlier
    if (!having_is_true(join->having_cond))
      return NESTED_LOOP_OK;  // Didn't match having
    error = 0;
    if (join->should_send_current_row())
      error = join->select_lex->query_result()->send_data(thd, *fields);
    if (error) return NESTED_LOOP_ERROR; /* purecov: inspected */

    ++join->send_records;
    thd->get_stmt_da()->inc_current_row_for_condition();
    if (join->send_records >= join->unit->select_limit_cnt &&
        !join->do_send_rows) {
      /*
        If we have used Priority Queue for optimizing order by with limit,
        then stop here, there are no more records to consume.
        When this optimization is used, end_send is called on the next
        join_tab.
      */
      if (join->order && join->calc_found_rows && qep_tab > join->qep_tab &&
          qep_tab[-1].filesort && qep_tab[-1].filesort->using_pq) {
        DBUG_PRINT("info", ("filesort NESTED_LOOP_QUERY_LIMIT"));
        return NESTED_LOOP_QUERY_LIMIT;
      }
    }
    if (join->send_records >= join->unit->select_limit_cnt &&
        join->do_send_rows) {
      if (join->calc_found_rows) {
        join->do_send_rows = 0;
        if (join->unit->fake_select_lex)
          join->unit->fake_select_lex->select_limit = 0;
        return NESTED_LOOP_OK;
      }
      return NESTED_LOOP_QUERY_LIMIT;  // Abort nicely
    } else if (join->send_records >= join->fetch_limit) {
      /*
        There is a server side cursor and all rows for
        this fetch request are sent.
      */
      return NESTED_LOOP_CURSOR_LIMIT;
    }
  }
  return NESTED_LOOP_OK;
}

/**
  Get exact count of rows in all tables. When this is called, at least one
  table's SE doesn't include HA_COUNT_ROWS_INSTANT.

  @param qep_tab      List of qep_tab in this JOIN.
  @param table_count  Count of qep_tab in the JOIN.
  @param error [out]  Return any possible error. Else return 0.

  @returns
    Cartesian product of the row counts of all tables if success;
    0 if error.

  @note The "error" parameter is required for the sake of testcases like the
        one in innodb-wl6742.test:272. Earlier, if an error was raised by
        ha_records, it wasn't handled by get_exact_record_count. Instead it
        was just allowed to go to the execution phase, where end_send_group
        would see the same error and raise it.

        But with the new function 'end_send_count' in the execution phase,
        such an error should be properly returned so that it can be raised.
*/
ulonglong get_exact_record_count(QEP_TAB *qep_tab, uint table_count,
                                 int *error) {
  ulonglong count = 1;
  QEP_TAB *qt;

  for (uint i = 0; i < table_count; i++) {
    ha_rows tmp = 0;
    qt = qep_tab + i;

    if (qt->type() == JT_ALL || (qt->index() == qt->table()->s->primary_key &&
                                 qt->table()->file->primary_key_is_clustered()))
      *error = qt->table()->file->ha_records(&tmp);
    else
      *error = qt->table()->file->ha_records(&tmp, qt->index());
    if (*error != 0) {
      (void)report_handler_error(qt->table(), *error);
      return 0;
    }
    count *= tmp;
  }
  *error = 0;
  return count;
}

enum_nested_loop_state end_send_count(JOIN *join, QEP_TAB *qep_tab) {
  List_iterator_fast<Item> it(join->all_fields);
  Item *item;
  int error = 0;
  THD *thd = join->thd;

  while ((item = it++)) {
    if (item->type() == Item::SUM_FUNC_ITEM &&
        (((Item_sum *)item))->sum_func() == Item_sum::COUNT_FUNC) {
      ulonglong count =
          get_exact_record_count(qep_tab, join->primary_tables, &error);
      if (error) return NESTED_LOOP_ERROR;

      ((Item_sum_count *)item)->make_const((longlong)count);
    }
  }

  /*
    Copy non-aggregated items in the result set.
    Handles queries like:
      SET @s = 1;
      SELECT @s, COUNT(*) FROM t1;
  */
  if (copy_fields(&join->tmp_table_param, thd)) return NESTED_LOOP_ERROR;

  if (having_is_true(join->having_cond) && join->should_send_current_row()) {
    if (join->select_lex->query_result()->send_data(thd, *join->fields))
      return NESTED_LOOP_ERROR;
    join->send_records++;
  }

  return NESTED_LOOP_OK;
}

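/*
  Illustrative example (hypothetical table, added for clarity):

    SELECT COUNT(*) FROM t1;

  on a storage engine that cannot report an instant row count takes this
  path: instead of scanning row by row, end_send_count() asks the handler for
  the exact count via ha_records(), folds the result into the Item_sum_count
  as a constant, and sends the single result row directly.
*/
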
/* ARGSUSED */
enum_nested_loop_state end_send_group(JOIN *join, QEP_TAB *qep_tab,
                                      bool end_of_records) {
  int idx = -1;
  enum_nested_loop_state ok_code = NESTED_LOOP_OK;
  DBUG_TRACE;
  THD *thd = join->thd;

  List<Item> *fields;
  if (qep_tab) {
    DBUG_ASSERT(qep_tab - 1 == join->ref_slice_immediately_before_group_by);
    fields = &join->tmp_fields_list[REF_SLICE_ORDERED_GROUP_BY];
  } else
    fields = join->fields;

  /*
    (1) Haven't seen a first row yet
    (2) Have seen all rows
    (3) GROUP expression are different from previous row's
  */
  if (!join->seen_first_record ||                                     // (1)
      end_of_records ||                                               // (2)
      (idx = update_item_cache_if_changed(join->group_fields)) >= 0)  // (3)
  {
    if (!join->group_sent &&
        (join->seen_first_record ||
         (end_of_records && !join->grouped && !join->group_optimized_away))) {
      if (idx < (int)join->send_group_parts) {
        /*
          As GROUP expressions have changed, we now send forward the group
          of the previous row.
          While end_write_group() has a real tmp table as output,
          end_send_group() has a pseudo-table, made of a list of Item_copy
          items (created by setup_copy_fields()) which are accessible through
          REF_SLICE_ORDERED_GROUP_BY. This is equivalent to one row where the
          current group is accumulated. The creation of a new group in the
          pseudo-table happens in this function (call to
          init_sum_functions()); the update of an existing group also happens
          in this function (call to update_sum_func()); the reading of an
          existing group happens right below.
          As we are now reading from pseudo-table REF_SLICE_ORDERED_GROUP_BY,
          we switch to this slice; we should not have switched when
          calculating group expressions in update_item_cache_if_changed()
          above; indeed these group expressions need the current row of the
          input table, not what is in this slice (which is generally the last
          completed group so is based on some previous row of the input
          table).
        */
        Switch_ref_item_slice slice_switch(join, REF_SLICE_ORDERED_GROUP_BY);
        DBUG_ASSERT(fields == join->get_current_fields());
        int error = 0;
        {
          table_map save_nullinfo = 0;
          if (!join->seen_first_record) {
            // Calculate aggregate functions for no rows
            for (Item &item : *fields) {
              item.no_rows_in_result();
            }

            /*
              Mark tables as containing only NULL values for processing
              the HAVING clause and for send_data().
              Calculate a set of tables for which NULL values need to
              be restored after sending data.
            */
            if (join->clear_fields(&save_nullinfo))
              return NESTED_LOOP_ERROR; /* purecov: inspected */
          }
          if (!having_is_true(join->having_cond))
            error = -1;  // Didn't satisfy having
          else {
            if (join->should_send_current_row())
              error =
                  join->select_lex->query_result()->send_data(thd, *fields);
            join->send_records++;
            thd->get_stmt_da()->inc_current_row_for_condition();
            join->group_sent = true;
          }
          if (join->rollup.state != ROLLUP::STATE_NONE && error <= 0) {
            if (join->rollup_send_data((uint)(idx + 1))) error = 1;
          }
          // Restore NULL values if needed.
          if (save_nullinfo) join->restore_fields(save_nullinfo);
        }
        if (error > 0) return NESTED_LOOP_ERROR; /* purecov: inspected */
        if (end_of_records) return NESTED_LOOP_OK;
        if (join->send_records >= join->unit->select_limit_cnt &&
            join->do_send_rows) {
          if (!join->calc_found_rows)
            return NESTED_LOOP_QUERY_LIMIT;  // Abort nicely
          join->do_send_rows = 0;
          join->unit->select_limit_cnt = HA_POS_ERROR;
        } else if (join->send_records >= join->fetch_limit) {
          /*
            There is a server side cursor and all rows
            for this fetch request are sent.
          */
          /*
            Preventing code duplication. When finished with the group reset
            the group functions and copy_fields. We fall through. bug #11904
          */
          ok_code = NESTED_LOOP_CURSOR_LIMIT;
        }
      }
    } else {
      if (end_of_records) return NESTED_LOOP_OK;
      join->seen_first_record = true;
      // Initialize the cache of GROUP expressions with this 1st row's values
      (void)(update_item_cache_if_changed(join->group_fields));
    }
    if (idx < (int)join->send_group_parts) {
      /*
        This branch is executed also for cursors which have finished their
        fetch limit - the reason for ok_code.

        As GROUP expressions have changed, initialize the new group:
        (1) copy non-aggregated expressions (they're constant over the group)
        (2) and reset group aggregate functions.

        About (1): some expressions to copy are not Item_fields and they are
        copied by copy_fields() which evaluates them (see
        param->grouped_expressions, set up in setup_copy_fields()). Thus,
        copy_fields() can evaluate functions. One of them, F2, may reference
        another one F1, example: SELECT expr AS F1 ... GROUP BY ... HAVING
        F2(F1)<=2 . Assume F1 and F2 are not aggregate functions. Then they
        are calculated by copy_fields() when starting a new group, i.e. here.
        As F2 uses an alias to F1, F1 is calculated first; F2 must use that
        value (not evaluate expr again, as expr may not be deterministic), so
        F2 uses a reference (Item_ref) to the already-computed value of F1;
        that value is in Item_copy part of REF_SLICE_ORDERED_GROUP_BY. So, we
        switch to that slice.
      */
      Switch_ref_item_slice slice_switch(join, REF_SLICE_ORDERED_GROUP_BY);
      if (copy_fields(&join->tmp_table_param, thd))  // (1)
        return NESTED_LOOP_ERROR;
      if (init_sum_functions(join->sum_funcs,
                             join->sum_funcs_end[idx + 1]))  // (2)
        return NESTED_LOOP_ERROR;
      join->group_sent = false;
      return ok_code;
    }
  }
  if (update_sum_func(join->sum_funcs)) return NESTED_LOOP_ERROR;
  return NESTED_LOOP_OK;
}

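/*
  Illustrative walk-through (hypothetical table, added for clarity):

    SELECT b, SUM(a) FROM t1 GROUP BY b;

  with rows arriving sorted on b. Each call above compares the cached
  GROUP BY values with the current row's; while b is unchanged,
  update_sum_func() folds the row into the running SUM. When b changes (or
  at end_of_records), the finished group held in REF_SLICE_ORDERED_GROUP_BY
  is sent to the client and the aggregates are re-initialized for the new
  group.
*/
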
static bool cmp_field_value(Field *field, ptrdiff_t diff) {
  DBUG_ASSERT(field);
  /*
    Records are different when:
    1) NULL flags aren't the same
    2) length isn't the same
    3) data isn't the same
  */
  const bool value1_isnull = field->is_real_null();
  const bool value2_isnull = field->is_real_null(diff);

  if (value1_isnull != value2_isnull)  // 1
    return true;
  if (value1_isnull) return false;  // Both values are null, no need to proceed.

  const size_t value1_length = field->data_length();
  const size_t value2_length = field->data_length(diff);

  if (field->type() == MYSQL_TYPE_JSON) {
    Field_json *json_field = down_cast<Field_json *>(field);

    // Fetch the JSON value on the left side of the comparison.
    Json_wrapper left_wrapper;
    if (json_field->val_json(&left_wrapper))
      return true; /* purecov: inspected */

    // Fetch the JSON value on the right side of the comparison.
    Json_wrapper right_wrapper;
    json_field->ptr += diff;
    bool err = json_field->val_json(&right_wrapper);
    json_field->ptr -= diff;
    if (err) return true; /* purecov: inspected */

    return (left_wrapper.compare(right_wrapper) != 0);
  }

  // Trailing space can't be skipped and the length is different
  if (!field->is_text_key_type() && value1_length != value2_length)  // 2
    return true;

  if (field->cmp_max(field->ptr, field->ptr + diff,  // 3
                     std::max(value1_length, value2_length)))
    return true;

  return false;
}

/**
  Compare GROUP BY fields from the tmp table's record[0] and record[1].

  @returns
    true   records are different
    false  records are the same
*/

static bool group_rec_cmp(ORDER *group, uchar *rec0, uchar *rec1) {
  DBUG_TRACE;
  ptrdiff_t diff = rec1 - rec0;

  for (ORDER *grp = group; grp; grp = grp->next) {
    Field *field = grp->field_in_tmp_table;
    if (cmp_field_value(field, diff)) return true;
  }
  return false;
}

/**
  Compare all visible fields between the tmp table's record[0] and record[1].

  @returns
    true   records are different
    false  records are the same
*/

static bool table_rec_cmp(TABLE *table) {
  DBUG_TRACE;
  ptrdiff_t diff = table->record[1] - table->record[0];
  Field **fields = table->visible_field_ptr();

  for (uint i = 0; i < table->visible_field_count(); i++) {
    Field *field = fields[i];
    if (cmp_field_value(field, diff)) return true;
  }
  return false;
}

/**
  Generate hash for a field

  @returns generated hash
*/

ulonglong unique_hash(Field *field, ulonglong *hash_val) {
  const uchar *pos, *end;
  uint64 seed1 = 0, seed2 = 4;
  ulonglong crc = *hash_val;

  if (field->is_null()) {
    /*
      Change crc in a way different from an empty string or 0.
      (This is an optimisation; the code will work even if
      this isn't done)
    */
    crc = ((crc << 8) + 511 + (crc >> (8 * sizeof(ha_checksum) - 8)));
    goto finish;
  }

  pos = field->get_ptr();
  end = pos + field->data_length();

  if (field->type() == MYSQL_TYPE_JSON) {
    Field_json *json_field = down_cast<Field_json *>(field);

    crc = json_field->make_hash_key(*hash_val);
  } else if (field->key_type() == HA_KEYTYPE_TEXT ||
             field->key_type() == HA_KEYTYPE_VARTEXT1 ||
             field->key_type() == HA_KEYTYPE_VARTEXT2) {
    field->charset()->coll->hash_sort(field->charset(), (const uchar *)pos,
                                      field->data_length(), &seed1, &seed2);
    crc ^= seed1;
  } else
    while (pos != end)
      crc = ((crc << 8) + (*pos++)) + (crc >> (8 * sizeof(ha_checksum) - 8));
finish:
  *hash_val = crc;
  return crc;
}

/**
  Generate hash for unique constraint according to group-by list.

  This reads the values of the GROUP BY expressions from fields, so it
  assumes those expressions have been computed and stored into fields of a
  temporary table; in practice this means that copy_fields() and
  copy_funcs() must have been called.
*/

static ulonglong unique_hash_group(ORDER *group) {
  DBUG_TRACE;
  ulonglong crc = 0;

  for (ORDER *ord = group; ord; ord = ord->next) {
    Field *field = ord->field_in_tmp_table;
    DBUG_ASSERT(field);
    unique_hash(field, &crc);
  }

  return crc;
}

/* Generate hash for unique_constraint for all visible fields of a table */

static ulonglong unique_hash_fields(TABLE *table) {
  ulonglong crc = 0;
  Field **fields = table->visible_field_ptr();

  for (uint i = 0; i < table->visible_field_count(); i++)
    unique_hash(fields[i], &crc);

  return crc;
}

/**
  Check unique_constraint.

  @details Calculates the record's hash and checks whether the record given
  in table->record[0] is already present in the tmp table.

  @param table the tmp table to check

  @note This function assumes record[0] is already filled by the caller.
  Depending on the presence of table->group, either the group-by list or the
  full list of the table's fields is used to calculate the hash.

  @returns
    false  same record was found
    true   record wasn't found
*/

bool check_unique_constraint(TABLE *table) {
  ulonglong hash;

  if (!table->hash_field) return true;

  if (table->no_keyread) return true;

  if (table->group)
    hash = unique_hash_group(table->group);
  else
    hash = unique_hash_fields(table);
  table->hash_field->store(hash, true);
  int res =
      table->file->ha_index_read_map(table->record[1], table->hash_field->ptr,
                                     HA_WHOLE_KEY, HA_READ_KEY_EXACT);
  while (!res) {
    // Check whether records are the same.
    if (!(table->group
              ? group_rec_cmp(table->group, table->record[0], table->record[1])
              : table_rec_cmp(table)))
      return false;  // skip it
    res = table->file->ha_index_next_same(table->record[1],
                                          table->hash_field->ptr,
                                          sizeof(hash));
  }
  return true;
}

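/*
  Illustrative flow (hypothetical query, added for clarity): for

    SELECT DISTINCT a, b FROM t1;

  materialized into a tmp table that uses a hash field instead of a unique
  key, each incoming row hashes its visible fields, probes the hash index,
  and walks ha_index_next_same() over hash collisions, comparing the actual
  column values (table_rec_cmp() above) before deciding whether the row is
  a duplicate.
*/
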
/**
  Minion for reset_framing_wf_states and reset_non_framing_wf_state, q.v.

  @param func_ptr the set of functions
  @param framing  true if we want to reset for framing window functions
*/
static inline void reset_wf_states(Func_ptr_array *func_ptr, bool framing) {
  for (auto it : *func_ptr) {
    (void)it.func()->walk(&Item::reset_wf_state, enum_walk::POSTFIX,
                          (uchar *)&framing);
  }
}
/**
  Walk the function calls and reset any framing window function's window state.

  @param func_ptr an array of function call items which might represent
                  or contain window function calls
*/
static inline void reset_framing_wf_states(Func_ptr_array *func_ptr) {
  reset_wf_states(func_ptr, true);
}

/**
  Walk the function calls and reset any non-framing window function's window
  state.

  @param func_ptr an array of function call items which might represent
                  or contain window function calls
*/
static inline void reset_non_framing_wf_state(Func_ptr_array *func_ptr) {
  reset_wf_states(func_ptr, false);
}

/**
  Dirty trick to be able to copy fields *back* from the frame buffer tmp table
  to the input table's buffer, cf. #bring_back_frame_row.

  @param param represents the frame buffer tmp file
*/
static void swap_copy_field_direction(Temp_table_param *param) {
  for (Copy_field &copy_field : param->copy_fields) copy_field.swap_direction();
}

/**
  Save a window frame buffer to frame buffer temporary table.

  @param thd    The current thread
  @param w      The current window
  @param rowno  The rowno in the current partition (1-based)
*/
static bool buffer_record_somewhere(THD *thd, Window *w, int64 rowno) {
  DBUG_TRACE;
  TABLE *const t = w->frame_buffer();
  uchar *record = t->record[0];

  DBUG_ASSERT(rowno != Window::FBC_FIRST_IN_NEXT_PARTITION);
  DBUG_ASSERT(t->is_created());

  if (!t->file->inited) {
    /*
      On the frame buffer table, t->file, we do several things in the
      windowing code:
      - read a row by position,
      - read rows after that row,
      - write a row,
      - find the position of a just-written row, if it's first in partition.
      To prepare for reads, we initialize a scan once for all with
      ha_rnd_init(), with argument=true as we'll use ha_rnd_next().
      To read a row, we use ha_rnd_pos() or ha_rnd_next().
      To write, we use ha_write_row().
      To find the position of a just-written row, we are in the following
      conditions:
      - the written row is first of its partition
      - before writing it, we have processed the previous partition, and that
        process ended with a read of the previous partition's last row
      - so, before the write, the read cursor is already positioned on that
        last row.
      Then we do the write; the new row goes after the last row; then
      ha_rnd_next() reads the row after the last row, i.e. reads the written
      row. Then position() gives the position of the written row.
    */
    int rc = t->file->ha_rnd_init(true);
    if (rc != 0) {
      t->file->print_error(rc, MYF(0));
      return true;
    }
  }

  int error = t->file->ha_write_row(record);
  w->set_frame_buffer_total_rows(w->frame_buffer_total_rows() + 1);

  if (error) {
    /* If this is a duplicate error, return immediately */
    if (t->file->is_ignorable_error(error)) return 1;

    /* Other error than duplicate error: Attempt to create a temporary table. */
    bool is_duplicate;
    if (create_ondisk_from_heap(thd, t, error, true, &is_duplicate)) return -1;

    DBUG_ASSERT(t->s->db_type() == innodb_hton);
    if (t->file->ha_rnd_init(true)) return true; /* purecov: inspected */

    /*
      Reset all hints since they all pertain to the in-memory file, not the
      new on-disk one.
    */
    for (uint i = Window::REA_FIRST_IN_PARTITION;
         i < Window::FRAME_BUFFER_POSITIONS_CARD +
                 w->opt_nth_row().m_offsets.size() +
                 w->opt_lead_lag().m_offsets.size();
         i++) {
      void *r = (*THR_MALLOC)->Alloc(t->file->ref_length);
      if (r == nullptr) return true;
      w->m_frame_buffer_positions[i].m_position = static_cast<uchar *>(r);
      w->m_frame_buffer_positions[i].m_rowno = -1;
    }

    if ((w->m_tmp_pos.m_position =
             (uchar *)(*THR_MALLOC)->Alloc(t->file->ref_length)) == nullptr)
      return true;

    w->m_frame_buffer_positions[Window::REA_FIRST_IN_PARTITION].m_rowno = 1;
    /*
      The auto-generated primary key of the first row is 1. Our offset is
      also one-based, so we can use w->frame_buffer_partition_offset() "as is"
      to construct the position.
    */
    encode_innodb_position(
        w->m_frame_buffer_positions[Window::REA_FIRST_IN_PARTITION].m_position,
        t->file->ref_length, w->frame_buffer_partition_offset());

    return is_duplicate ? 1 : 0;
  }

  /* Save position in frame buffer file of first row in a partition */
  if (rowno == 1) {
    if (w->m_frame_buffer_positions.empty()) {
      w->m_frame_buffer_positions.init(thd->mem_root);
      /* lazy initialization of positions remembered */
      for (uint i = 0; i < Window::FRAME_BUFFER_POSITIONS_CARD +
                               w->opt_nth_row().m_offsets.size() +
                               w->opt_lead_lag().m_offsets.size();
           i++) {
        void *r = (*THR_MALLOC)->Alloc(t->file->ref_length);
        if (r == nullptr) return true;
        Window::Frame_buffer_position p(static_cast<uchar *>(r), -1);
        w->m_frame_buffer_positions.push_back(p);
      }

      if ((w->m_tmp_pos.m_position =
               (uchar *)(*THR_MALLOC)->Alloc(t->file->ref_length)) == nullptr)
        return true;
    }

    // Do a read to establish scan position, then get it
    error = t->file->ha_rnd_next(record);
    t->file->position(record);
    std::memcpy(
        w->m_frame_buffer_positions[Window::REA_FIRST_IN_PARTITION].m_position,
        t->file->ref, t->file->ref_length);
    w->m_frame_buffer_positions[Window::REA_FIRST_IN_PARTITION].m_rowno = 1;
    w->set_frame_buffer_partition_offset(w->frame_buffer_total_rows());
  }

  return false;
}

/**
  If we cannot evaluate all window functions for a window on the fly, buffer
  the current row for later processing by
  process_buffered_windowing_record.

  @param thd    Current thread
  @param param  The temporary table parameter

  @param[in,out] new_partition If input is not nullptr:
                 sets the bool pointed to to true if a new partition
                 was found and there was a previous partition; if
                 so the buffering of the first row in new
                 partition isn't done and must be repeated
                 later: we save away the row as rowno
                 FBC_FIRST_IN_NEXT_PARTITION, then fetch it back
                 later, cf. end_write_wf.
                 If input is nullptr, this is the "later" call to
                 buffer the first row of the new partition:
                 buffer the row.
  @return true if error.
*/
bool buffer_windowing_record(THD *thd, Temp_table_param *param,
                             bool *new_partition) {
  DBUG_TRACE;
  Window *w = param->m_window;

  if (copy_fields(w->frame_buffer_param(), thd)) return true;

  if (new_partition != nullptr) {
    const bool first_partition = w->partition_rowno() == 0;
    w->check_partition_boundary();

    if (!first_partition && w->partition_rowno() == 1) {
      *new_partition = true;
      w->save_special_record(Window::FBC_FIRST_IN_NEXT_PARTITION,
                             w->frame_buffer());
      return false;
    }
  }

  /*
    The record is now ready in TABLE and can be saved. The window
    function(s) on the window have not yet been evaluated, but
    will be evaluated when we read frame rows back, before the end wf result
    (usually ready in the last read when the last frame row has been read
    back) can be produced. E.g. SUM(i): we save away all rows in partition.
    We read back rows in current row's frame, producing the total SUM in the
    last read back row. That value for SUM will then be used for the current
    row output.
  */

  if (w->needs_restore_input_row()) {
    w->save_special_record(Window::FBC_LAST_BUFFERED_ROW, w->frame_buffer());
  }

  if (buffer_record_somewhere(thd, w, w->partition_rowno())) return true;

  w->set_last_rowno_in_cache(w->partition_rowno());

  return false;
}

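/*
  Illustrative example (hypothetical table, added for clarity): for

    SELECT SUM(x) OVER (ORDER BY i ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)
    FROM t1;

  no row's SUM can be emitted on the fly: each input row is first appended
  to the frame buffer here, and the result for a given row is produced only
  after its 1-FOLLOWING neighbour has also been buffered and the frame rows
  have been read back.
*/
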
/**
  Read row rowno from frame buffer tmp file using cached row positions to
  minimize positioning work.
*/
static bool read_frame_buffer_row(int64 rowno, Window *w,
#ifndef DBUG_OFF
                                  bool for_nth_value)
#else
                                  bool for_nth_value MY_ATTRIBUTE((unused)))
#endif
{
  int use_idx = 0;  // closest prior position found, a priori 0 (row 1)
  int diff = w->last_rowno_in_cache();  // maximum a priori
  TABLE *t = w->frame_buffer();

  // Find the saved position closest to where we want to go
  for (int i = w->m_frame_buffer_positions.size() - 1; i >= 0; i--) {
    auto cand = w->m_frame_buffer_positions[i];
    if (cand.m_rowno == -1 || cand.m_rowno > rowno) continue;

    if (rowno - cand.m_rowno < diff) {
      /* closest so far */
      diff = rowno - cand.m_rowno;
      use_idx = i;
    }
  }

  auto cand = &w->m_frame_buffer_positions[use_idx];

  int error =
      t->file->ha_rnd_pos(w->frame_buffer()->record[0], cand->m_position);
  if (error) {
    t->file->print_error(error, MYF(0));
    return true;
  }

  if (rowno > cand->m_rowno) {
    /*
      The saved position didn't correspond exactly to where we want to go,
      but is located one or more rows further out on the file, so read next
      to move forward to the desired row.
    */
    const int64 cnt = rowno - cand->m_rowno;

    /*
      We should have enough location hints to normally need only one extra
      read. If we have just switched to INNODB due to MEM overflow, a rescan
      is required, so skip the assert if we have INNODB.
    */
    DBUG_ASSERT(w->frame_buffer()->s->db_type()->db_type == DB_TYPE_INNODB ||
                cnt <= 1 ||
                // unless we have a frame beyond the current row the first
                // time, in which case we need to do some scanning...
                (w->last_row_output() == 0 &&
                 w->frame()->m_from->m_border_type == WBT_VALUE_FOLLOWING) ||
                // or unless we are searching for NTH_VALUE, which can be in
                // the middle of a frame, and with RANGE frames it can jump
                // many positions from one frame to the next with the
                // optimized eval strategy
                for_nth_value);

    for (int i = 0; i < cnt; i++) {
      error = t->file->ha_rnd_next(t->record[0]);
      if (error) {
        t->file->print_error(error, MYF(0));
        return true;
      }
    }
  }

  return false;
}

#if !defined(DBUG_OFF)
inline static void dbug_allow_write_all_columns(
    Temp_table_param *param, std::map<TABLE *, my_bitmap_map *> &map) {
  for (auto &copy_field : param->copy_fields) {
    TABLE *const t = copy_field.from_field()->table;
    if (t != nullptr) {
      auto it = map.find(t);
      if (it == map.end())
        map.insert(it, std::pair<TABLE *, my_bitmap_map *>(
                           t, dbug_tmp_use_all_columns(t, t->write_set)));
    }
  }
}

inline static void dbug_restore_all_columns(
    std::map<TABLE *, my_bitmap_map *> &map) {
  auto func = [](std::pair<TABLE *const, my_bitmap_map *> &e) {
    dbug_tmp_restore_column_map(e.first->write_set, e.second);
  };

  std::for_each(map.begin(), map.end(), func);
}
#endif

/**
|
|
Bring back buffered data to the record of qep_tab-1 [1], and optionally
|
|
execute copy_fields() to the OUT table.
|
|
|
|
[1] This is not always the case. For the first window, if we have no
|
|
PARTITION BY or ORDER BY in the window, and there is more than one table
|
|
in the join, the logical input can consist of more than one table
|
|
(qep_tab-1 .. qep_tab-n), so the record accordingly.
|
|
|
|
This method works by temporarily reversing the "normal" direction of the field
|
|
copying.
|
|
|
|
Also make a note of the position of the record we retrieved in the window's
|
|
m_frame_buffer_positions to be able to optimize succeeding retrievals.
|
|
|
|
@param thd The current thread
|
|
@param w The current window
|
|
@param out_param OUT table; if not nullptr, does copy_fields() to OUT
|
|
@param rowno The row number (in the partition) to set up
|
|
@param reason What kind of row to retrieve
|
|
@param fno Used with NTH_VALUE and LEAD/LAG to specify which
|
|
window function's position cache to use, i.e. what index
|
|
of m_frame_buffer_positions to update. For the second
|
|
LEAD/LAG window function in a query, the index would be
|
|
REA_MISC_POSITIONS (reason) + \<no of NTH functions\> + 2.
|
|
|
|
@return true on error
|
|
*/
bool bring_back_frame_row(THD *thd, Window &w, Temp_table_param *out_param,
                          int64 rowno,
                          enum Window::retrieve_cached_row_reason reason,
                          int fno) {
  DBUG_TRACE;
  DBUG_PRINT("enter",
             ("rowno: %" PRId64 " reason: %d fno: %d", rowno, reason, fno));
  DBUG_ASSERT(reason == Window::REA_MISC_POSITIONS || fno == 0);

  uchar *fb_rec = w.frame_buffer()->record[0];

  DBUG_ASSERT(rowno != 0);

  /*
    If requested row is the last we fetched from FB and copied to OUT, we
    don't need to fetch and copy again.
    Because "reason", "fno" may differ from the last call which fetched the
    row, we still do the updates of w.m_frame_buffer_positions even if
    do_fetch=false.
  */
  bool do_fetch;

  if (rowno == Window::FBC_FIRST_IN_NEXT_PARTITION) {
    do_fetch = true;
    w.restore_special_record(rowno, fb_rec);
  } else if (rowno == Window::FBC_LAST_BUFFERED_ROW) {
    do_fetch = w.row_has_fields_in_out_table() != w.last_rowno_in_cache();
    if (do_fetch) w.restore_special_record(rowno, fb_rec);
  } else {
    DBUG_ASSERT(reason != Window::REA_WONT_UPDATE_HINT);
    do_fetch = w.row_has_fields_in_out_table() != rowno;

    if (do_fetch &&
        read_frame_buffer_row(rowno, &w, reason == Window::REA_MISC_POSITIONS))
      return true;

    /* Got row rowno in record[0], remember position */
    const TABLE *const t = w.frame_buffer();
    t->file->position(fb_rec);
    std::memcpy(w.m_frame_buffer_positions[reason + fno].m_position,
                t->file->ref, t->file->ref_length);
    w.m_frame_buffer_positions[reason + fno].m_rowno = rowno;
  }

  if (!do_fetch) return false;

  Temp_table_param *const fb_info = w.frame_buffer_param();

#if !defined(DBUG_OFF)
  /*
    Since we are copying back a row from the frame buffer to the input table's
    buffer, we will be copying into fields that are not necessarily marked as
    writeable. To eliminate problems with ASSERT_COLUMN_MARKED_FOR_WRITE, we
    set all fields writeable. This is only applicable in debug builds, since
    ASSERT_COLUMN_MARKED_FOR_WRITE is debug only.
  */
  std::map<TABLE *, my_bitmap_map *> saved_map;
  dbug_allow_write_all_columns(fb_info, saved_map);
#endif

  /*
    Do the inverse of copy_fields to get the row's fields back to the input
    table from the frame buffer.
  */
  swap_copy_field_direction(fb_info);

  bool rc = copy_fields(fb_info, thd);

  swap_copy_field_direction(fb_info);  // reset original direction

#if !defined(DBUG_OFF)
  dbug_restore_all_columns(saved_map);
#endif

  if (!rc) {
    if (out_param) {
      if (copy_fields(out_param, thd)) return true;
      // fields are in IN and in OUT
      if (rowno >= 1) w.set_row_has_fields_in_out_table(rowno);
    } else
      // we only wrote IN record, so OUT and IN are inconsistent
      w.set_row_has_fields_in_out_table(0);
  }

  return rc;
}

/**
  Save row special_rowno in table t->record[0] to an in-memory copy for later
  restoration.
*/
void Window::save_special_record(uint64 special_rowno, TABLE *t) {
  DBUG_PRINT("info", ("save_special_record: %" PRIu64, special_rowno));
  size_t l = t->s->reclength;
  DBUG_ASSERT(m_special_rows_cache_max_length >= l);  // check room.
  // From negative enum, get proper array index:
  int idx = FBC_FIRST_KEY - special_rowno;
  m_special_rows_cache_length[idx] = l;
  std::memcpy(m_special_rows_cache + idx * m_special_rows_cache_max_length,
              t->record[0], l);
}

/**
  Restore row special_rowno into record from in-memory copy. Any fields that
  are not the result of window functions are not used, but they do tag along
  here (unnecessary but harmless copying). BLOBs have their storage in the
  result_field of the window function's Item, although the pointer is copied
  here. The result field storage is stable across reads from the frame
  buffer, so this is safe.
*/
void Window::restore_special_record(uint64 special_rowno, uchar *record) {
  DBUG_PRINT("info", ("restore_special_record: %" PRIu64, special_rowno));
  int idx = FBC_FIRST_KEY - special_rowno;
  size_t l = m_special_rows_cache_length[idx];
  std::memcpy(record,
              m_special_rows_cache + idx * m_special_rows_cache_max_length, l);
  // Sometimes, "record" points to IN record
  set_row_has_fields_in_out_table(0);
}

/**
  Process window functions that need partition cardinality
*/
bool process_wfs_needing_card(
    THD *thd, Temp_table_param *param, const Window::st_nth &have_nth_value,
    const Window::st_lead_lag &have_lead_lag, const int64 current_row,
    Window &w, enum Window::retrieve_cached_row_reason current_row_reason) {
  w.set_rowno_being_visited(current_row);

  // Reset state for LEAD/LAG functions
  if (!have_lead_lag.m_offsets.empty()) w.reset_lead_lag();

  // This also handles LEAD(.., 0)
  if (copy_funcs(param, thd, CFT_WF_NEEDS_CARD)) return true;

  if (!have_lead_lag.m_offsets.empty()) {
    int fno = 0;
    const int nths = have_nth_value.m_offsets.size();

    for (auto &ll : have_lead_lag.m_offsets) {
      const int64 rowno_to_visit = current_row - ll.m_rowno;

      if (rowno_to_visit == current_row)
        continue;  // Already processed above

      /*
        Note that this value can be outside the partition, even negative: if
        so, the default will be applied, if any is provided.
      */
      w.set_rowno_being_visited(rowno_to_visit);

      if (rowno_to_visit >= 1 && rowno_to_visit <= w.last_rowno_in_cache()) {
        if (bring_back_frame_row(thd, w, param, rowno_to_visit,
                                 Window::REA_MISC_POSITIONS, nths + fno++))
          return true;
      }

      if (copy_funcs(param, thd, CFT_WF_NEEDS_CARD)) return true;
    }
    /* Bring back the fields for the output row */
    if (bring_back_frame_row(thd, w, param, current_row, current_row_reason))
      return true;
  }

  return false;
}
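
/*
  Note (an illustrative reading of the loop above, not original text): with
  rowno_to_visit = current_row - ll.m_rowno, a LAG with offset 2 (m_rowno ==
  2) visits the row two positions before the current row; LEAD is assumed to
  register a negative m_rowno so that the same subtraction looks ahead.
*/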

/**
  While there are more unprocessed rows ready to process given the current
  partition/frame state, process such buffered rows by evaluating/aggregating
  the window functions defined over this window on the current frame, moving
  the frame if required.

  This method contains the main execution time logic of the evaluation of
  window functions if we need buffering for one or more of the window
  functions defined on the window.

  Moving (sliding) frames can be executed using a naive or optimized strategy
  for aggregate window functions, like SUM or AVG (but not MAX or MIN).
  In the naive approach, for each row considered for processing from the
  buffer, we visit all the rows defined in the frame for that row, essentially
  leading to N*M complexity, where N is the number of rows in the result set,
  and M is the number of rows in the frame. This can be slow for large frames,
  obviously, so we can choose an optimized evaluation strategy using
  inversion. This means that when rows leave the frame as we move it forward,
  we re-use the previous aggregate state, but compute the *inverse* function
  to eliminate the contribution to the aggregate by the row(s) leaving the
  frame, and then use the normal aggregate function to add the contribution
  of the rows moving into the frame. The present method contains code paths
  for both strategies.

  For integral data types, this is safe in the sense that the result will be
  the same if no overflow occurs during normal evaluation. For floating-point
  numbers, optimizing in this way may lead to different results, so it is not
  done by default, cf. the session variable "windowing_use_high_precision".
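
  A sketch of the inversion arithmetic (an added illustration, not original
  text): for SUM(x) OVER (ROWS BETWEEN 2 PRECEDING AND CURRENT ROW), when
  moving from row r to row r+1 we can compute
  sum(r+1) = sum(r) - x[r-2] + x[r+1], i.e. one inverse ("remove") and one
  normal ("add") application per step instead of re-summing the whole frame.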

  Since the evaluation strategy is chosen based on the "most difficult" window
  function defined on the window, we must also be able to evaluate
  non-aggregates like ROW_NUMBER, NTILE, FIRST_VALUE in the code path of the
  optimized aggregates, so there is redundant code for those in the naive and
  optimized code paths. Note that NTILE forms a class of its own among the
  non-aggregates: it needs two passes over the partition's rows since the
  cardinality is needed to compute it. Furthermore, FIRST_VALUE and LAST_VALUE
  heed the frames, but they are not aggregates.

  There is a special optimized code path for *static aggregates*: when the
  window frame is the default, e.g. the entire partition and there is no
  ORDER BY specified, the value of the framing window functions, i.e. SUM,
  AVG, FIRST_VALUE, LAST_VALUE, can be evaluated once and for all and saved
  when we visit and evaluate the first row of the partition. For later rows
  we restore the aggregate values and just fill in the other fields and
  evaluate non-framing window functions for the row.
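
  For example (an added illustration): SUM(x) OVER (PARTITION BY p), with no
  ORDER BY and no explicit frame, is a static aggregate: its value is the
  same for every row of the partition, so it is computed only when the
  partition's first row is visited.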

  The code paths both for naive execution and optimized execution differ
  depending on whether we have ROW or RANGE boundaries in an explicit frame.

  A word on BLOBs. Below we make copies of rows into the frame buffer.
  This is a temporary table, so BLOBs get copied in the normal way.

  Sometimes we save records containing already computed framing window
  functions away into memory only: is the lifetime of the referenced BLOBs
  long enough? We have two cases:

  BLOB results from wfs: Any BLOB results will reside in the copies in result
  fields of the Items ready for the out file, so they no longer need any BLOB
  memory read from the frame buffer tmp file.

  BLOB fields not evaluated by wfs: Any other BLOB field will be copied as
  well, and would not have life-time past the next read from the frame buffer,
  but they are never used since we fill in the fields from the current row
  after evaluation of the window functions, so we don't need to make special
  copies of such BLOBs. This can be (and was) tested by shredding any BLOBs
  deallocated by InnoDB at the next read.

  We also save away in memory the next record of the next partition while
  processing the current partition. Any blob there will have its storage from
  the read of the input file, but we won't be touching that for reading again
  until after we start processing the next partition and save the saved away
  next partition row to the frame buffer.

  Note that the logic of this function is centered around the window, not
  around the window function. It is about putting rows in a partition,
  in a frame, in a set of peers, and passing this information to all window
  functions attached to this window; each function looks at the partition,
  frame, or peer set in its own particular way (for example RANK looks at the
  partition, SUM looks at the frame).

  @param thd                    Current thread
  @param param                  Current temporary table
  @param new_partition_or_eof   True if (we are about to start a new partition
                                and there was a previous partition) or eof
  @param[out] output_row_ready  True if there is a row record ready to write
                                to the out table

  @return true if error
*/
bool process_buffered_windowing_record(THD *thd, Temp_table_param *param,
                                       const bool new_partition_or_eof,
                                       bool *output_row_ready) {
  DBUG_TRACE;
  /**
    The current window
  */
  Window &w = *param->m_window;

  /**
    The frame
  */
  const PT_frame *f = w.frame();

  *output_row_ready = false;

  /**
    This is the row we are currently considering for processing and getting
    ready for output, cf. output_row_ready.
  */
  const int64 current_row = w.last_row_output() + 1;

  /**
    This is the row number of the last row we have buffered so far.
  */
  const int64 last_rowno_in_cache = w.last_rowno_in_cache();

  if (current_row > last_rowno_in_cache)  // already sent all buffered rows
    return false;

  /**
    If true, use code path for static aggregates
  */
  const bool static_aggregate = w.static_aggregates();

  /**
    If true, use code path for ROW bounds with optimized strategy
  */
  const bool row_optimizable = w.optimizable_row_aggregates();

  /**
    If true, use code path for RANGE bounds with optimized strategy
  */
  const bool range_optimizable = w.optimizable_range_aggregates();

  // These three strategies are mutually exclusive:
  DBUG_ASSERT((static_aggregate + row_optimizable + range_optimizable) <= 1);

  /**
    We need to evaluate FIRST_VALUE, or optimized MIN/MAX
  */
  const bool have_first_value = w.opt_first_row();

  /**
    We need to evaluate LAST_VALUE, or optimized MIN/MAX
  */
  const bool have_last_value = w.opt_last_row();

  /**
    We need to evaluate NTH_VALUE
  */
  const Window::st_nth &have_nth_value = w.opt_nth_row();

  /**
    We need to evaluate LEAD/LAG rows
  */

  const Window::st_lead_lag &have_lead_lag = w.opt_lead_lag();

  /**
    True if an inversion optimization strategy is used. For common
    code paths.
  */
  const bool optimizable = (row_optimizable || range_optimizable);

  /**
    RANGE was specified as the bounds unit for the frame
  */
  const bool range_frame = f->m_unit == WFU_RANGE;

  const bool range_to_current_row =
      range_frame && f->m_to->m_border_type == WBT_CURRENT_ROW;

  const bool range_from_first_to_current_row =
      range_to_current_row &&
      f->m_from->m_border_type == WBT_UNBOUNDED_PRECEDING;
  /**
    UNBOUNDED FOLLOWING was specified for the frame
  */
  bool unbounded_following = false;

  /**
    Row_number of the first row in the frame. Invariant: lower_limit >= 1
    after initialization.
  */
  int64 lower_limit = 1;

  /**
    Row_number of the logically last row to be computed in the frame, may be
    higher than the number of rows in the partition. The actual highest row
    number is computed later, see upper below.
  */
  int64 upper_limit = 0;

  /**
    needs peerset of current row to evaluate a wf for the current row.
  */
  bool needs_peerset = w.needs_peerset();

  /**
    needs the last peer of the current row within a frame.
  */
  const bool needs_last_peer_in_frame = w.needs_last_peer_in_frame();

  DBUG_PRINT("enter", ("current_row: %" PRId64 ", new_partition_or_eof: %d",
                       current_row, new_partition_or_eof));

  /* Compute lower_limit, upper_limit and possibly unbounded_following */
  if (f->m_unit == WFU_RANGE) {
    lower_limit = w.first_rowno_in_range_frame();
    /*
      For RANGE frame, we first buffer all the rows in the partition due to
      the need to find the last peer before the first can be processed. This
      can be optimized, FIXME.
    */
    upper_limit = INT64_MAX;
  } else {
    DBUG_ASSERT(f->m_unit == WFU_ROWS);
    bool lower_within_limits = true;
    /* Determine lower border */
    int64 border =
        f->m_from->border() != nullptr ? f->m_from->border()->val_int() : 0;
    switch (f->m_from->m_border_type) {
      case WBT_CURRENT_ROW:
        lower_limit = current_row;
        break;
      case WBT_VALUE_PRECEDING:
        /*
          Example: 1 PRECEDING and current row == 2 => 1
                                   current row == 1 => 1
                                   current row == 3 => 2
        */
        lower_limit = std::max<int64>(current_row - border, 1);
        break;
      case WBT_VALUE_FOLLOWING:
        /*
          Example: 1 FOLLOWING and current row == 2 => 3
                                   current row == 1 => 2
                                   current row == 3 => 4
        */
        if (border <= (std::numeric_limits<int64>::max() - current_row))
          lower_limit = current_row + border;
        else {
          lower_within_limits = false;
          lower_limit = INT64_MAX;
        }
        break;
      case WBT_UNBOUNDED_PRECEDING:
        lower_limit = 1;
        break;
      case WBT_UNBOUNDED_FOLLOWING:
        DBUG_ASSERT(false);
        break;
    }

    /* Determine upper border */
    border = f->m_to->border() != nullptr ? f->m_to->border()->val_int() : 0;
    {
      switch (f->m_to->m_border_type) {
        case WBT_CURRENT_ROW:
          // we always have enough cache
          upper_limit = current_row;
          break;
        case WBT_VALUE_PRECEDING:
          upper_limit = current_row - border;
          break;
        case WBT_VALUE_FOLLOWING:
          if (border <= (std::numeric_limits<longlong>::max() - current_row))
            upper_limit = current_row + border;
          else {
            upper_limit = INT64_MAX;
            /*
              If both the border specifications are beyond numeric limits,
              the window frame is empty.
            */
            if (f->m_from->m_border_type == WBT_VALUE_FOLLOWING &&
                !lower_within_limits) {
              lower_limit = INT64_MAX;
              upper_limit = INT64_MAX - 1;
            }
          }
          break;
        case WBT_UNBOUNDED_FOLLOWING:
          unbounded_following = true;
          upper_limit = INT64_MAX;  // need whole partition
          break;
        case WBT_UNBOUNDED_PRECEDING:
          DBUG_ASSERT(false);
          break;
      }
    }
  }
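
  /*
    Added illustration (not original text): for ROWS BETWEEN 2 PRECEDING AND
    2 FOLLOWING with current_row == 5, the switches above yield
    lower_limit == 3 and upper_limit == 7; upper_limit may exceed the number
    of buffered rows and is capped by "upper" further below.
  */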

  /*
    Determine if, given our current read and buffering state, we have enough
    buffered rows to compute an output row.

    Example: ROWS BETWEEN 1 PRECEDING and 3 FOLLOWING

    State:
    +---+-------------------------------+
    |   | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
    +---+-------------------------------+
    ^   1?            ^
    lower             last_rowno_in_cache
    (0)               (4)

    This state means:

    We have read 4 rows, cf. value of last_rowno_in_cache.
    We can now process row 1 since both lower (1-1=0) and upper (1+3=4) are
    less than or equal to 4, the last row in the cache so far.

    We can not process row 2 since: !(4 >= 2 + 3) and we haven't seen the
    last row in partition which means that the frame may not be full yet.

    If we have a window function that needs to know the partition
    cardinality, we also must buffer all records of the partition before
    processing.
  */

  if (!((lower_limit <= last_rowno_in_cache &&
         upper_limit <= last_rowno_in_cache &&
         !w.needs_card()) || /* we have cached enough rows */
        new_partition_or_eof /* we have cached all rows */))
    return false;  // We haven't read enough rows yet, so return

  w.set_rowno_in_partition(current_row);

  /*
    By default, we must:
    - if we are the first row of a partition, reset values for both
      non-framing and framing WFs
    - reset values for framing WFs (new current row = new frame = new
      values for WFs).

    Both resettings require restoring the row from the FB. And, as we have
    restored this row, we use this opportunity to compute non-framing
    does-not-need-card functions.

    The meaning of the if statements below is that in some cases, we can
    avoid this default behaviour.

    For example, if we have static framing WFs, and this is not the
    partition's first row: the previous row's framing-WF values should be
    reused without change, so all the above resetting must be skipped;
    so row restoration isn't immediately needed; that and the computation of
    non-framing functions is then done in another later block of code.
    Likewise, if we have framing WFs with inversion, and it's not the
    first row of the partition, we must skip the resetting of framing WFs.
  */
  if (!static_aggregate || current_row == 1) {
    /*
      We need to reset functions. As part of it, their comparators need to
      update themselves to use the new row as base line. So, restore it:
    */
    if (bring_back_frame_row(thd, w, param, current_row, Window::REA_CURRENT))
      return true;

    if (current_row == 1)  // new partition
      reset_non_framing_wf_state(param->items_to_copy);
    if (!optimizable || current_row == 1)  // new frame
    {
      reset_framing_wf_states(param->items_to_copy);
    }  // else we remember state and update it for row 2..N

    /* E.g. ROW_NUMBER, RANK, DENSE_RANK */
    if (copy_funcs(param, thd, CFT_WF_NON_FRAMING)) return true;
    if (!optimizable || current_row == 1) {
      /*
        So far frame is empty; set up a flag which makes framing WFs set
        themselves to NULL in OUT.
      */
      w.set_do_copy_null(true);
      if (copy_funcs(param, thd, CFT_WF_FRAMING)) return true;
      w.set_do_copy_null(false);
    }  // else aggregates keep value of previous row, and we'll do inversion
  }

  if (range_frame) {
    /* establish current row as base-line for RANGE computation */
    w.reset_order_by_peer_set();
  }

  bool first_row_in_range_frame_seen = false;

  /**
    For optimized strategy we want to save away the previous aggregate result
    and reuse it in a later round by inversion. This keeps track of whether
    we managed to compute results for this current row (results are
    "primed"), so we can use inversion in later rows. Cf.
    Window::m_aggregates_primed.
  */
  bool optimizable_primed = false;

  /**
    Possible adjustment of the logical upper_limit: no rows exist beyond
    last_rowno_in_cache.
  */
  const int64 upper = min(upper_limit, last_rowno_in_cache);

  /*
    Optimization: we evaluate the peer set of the current row potentially
    several times. Window functions like CUME_DIST set needs_peerset and are
    evaluated last, so if any other wf evaluation led to finding the peer set
    of the current row, make a note of it, so we can skip doing it twice.
  */
  bool have_peers_current_row = false;

  if ((static_aggregate && current_row == 1) ||   // skip for row > 1
      (optimizable && !w.aggregates_primed()) ||  // skip for 2..N in frame
      (!static_aggregate && !optimizable))        // normal: no skip
  {
    // Compute and output current_row.
    int64 rowno;        ///< iterates over rows in a frame
    int64 skipped = 0;  ///< RANGE: # of visited rows seen before the frame

    for (rowno = lower_limit; rowno <= upper; rowno++) {
      if (optimizable) optimizable_primed = true;

      /*
        Set window frame state before computing framing window function.
        'n' is the number of row #rowno relative to the beginning of the
        frame, 1-based.
      */
      const int64 n = rowno - lower_limit + 1 - skipped;

      w.set_rowno_in_frame(n);
      w.set_rowno_being_visited(rowno);

      const Window::retrieve_cached_row_reason reason =
          (n == 1 ? Window::REA_FIRST_IN_FRAME : Window::REA_LAST_IN_FRAME);
      /*
        Hint maintenance: we will normally read past last row in frame, so
        prepare to resurrect that hint once we do.
      */
      w.save_pos(reason);

      /* Set up the non-wf fields for aggregating to the output row. */
      if (bring_back_frame_row(thd, w, param, rowno, reason)) return true;

      if (range_frame) {
        if (w.before_frame()) {
          skipped++;
          continue;
        }
        if (w.after_frame()) {
          w.set_last_rowno_in_range_frame(rowno - 1);

          if (!first_row_in_range_frame_seen)
            // empty frame, optimize starting point for next row
            w.set_first_rowno_in_range_frame(rowno);
          w.restore_pos(reason);
          break;
        }  // else: row is within range, process

        if (!first_row_in_range_frame_seen) {
          /*
            Optimize starting point for next row: monotonic increase in frame
            bounds
          */
          first_row_in_range_frame_seen = true;
          w.set_first_rowno_in_range_frame(rowno);
        }
      }

      /*
        Compute framing WFs. For a ROWS frame, "upper" is exactly the frame's
        last row; but for the case of RANGE we can't be sure that this is
        indeed the last row, so we must make a pessimistic assumption. If it
        is not the last, the final row calculation, if any, as for AVG, will
        be repeated for the next peer row(s).
        For optimized MIN/MAX [1], we do this to make sure we have a non-NULL
        last value (if one exists) for the initial frame.
      */
      const bool setstate =
          (rowno == upper || range_frame || have_last_value /* [1] */);
      if (setstate)
        w.set_is_last_row_in_frame(true);  // temporary state for next call

      // Accumulate frame's row into WF's value for current_row:
      if (copy_funcs(param, thd, CFT_WF_FRAMING)) return true;

      if (setstate)
        w.set_is_last_row_in_frame(false);  // undo temporary state
    }

    if (range_frame || rowno > upper)  // no more rows in partition
    {
      if (range_frame) {
        if (!first_row_in_range_frame_seen) {
          /*
            Empty frame: optimize starting point for next row: monotonic
            increase in frame bounds
          */
          w.set_first_rowno_in_range_frame(rowno);
        }
      }
      w.set_last_rowno_in_range_frame(rowno - 1);
      if (range_to_current_row) {
        w.set_last_rowno_in_peerset(w.last_rowno_in_range_frame());
        have_peers_current_row = true;
      }
    }  // else: we already set it before breaking out of loop
  }

  /*
    While the block above was for the default execution method, below we have
    alternative blocks for optimized methods: static framing WFs and
    inversion, when current_row isn't first; i.e. we can use the previous
    row's value of framing WFs as a base.
    In the row buffer of OUT, after the previous row was emitted, these values
    of framing WFs are still present, as no copy_funcs(CFT_WF_FRAMING) was run
    for our new row yet.
  */
  if (static_aggregate && current_row != 1) {
    /* Set up the correct non-wf fields for copying to the output row */
    if (bring_back_frame_row(thd, w, param, current_row, Window::REA_CURRENT))
      return true;

    /* E.g. ROW_NUMBER, RANK, DENSE_RANK */
    if (copy_funcs(param, thd, CFT_WF_NON_FRAMING)) return true;
  } else if (row_optimizable && w.aggregates_primed()) {
    /*
      Rows 2..N in partition: we still have state from previous current row's
      frame computation, now adjust by subtracting row 1 in frame (lower_limit)
      and adding new, if any, final frame row
    */
    const bool remove_previous_first_row =
        (lower_limit > 1 && lower_limit - 1 <= last_rowno_in_cache);
    const bool new_last_row =
        (upper_limit <= upper &&
         !unbounded_following /* all added when primed */);
    const int64 rn_in_frame = upper - lower_limit + 1;

    /* possibly subtract: early in partition there may not be any */
    if (remove_previous_first_row) {
      /*
        Check if the row leaving the frame is the last row in the peerset
        within a frame. If true, set is_last_row_in_peerset_within_frame
        to true.
        Used by JSON_OBJECTAGG to remove the key/value pair only
        when it is the last row having that key value.
      */
      if (needs_last_peer_in_frame) {
        int64 rowno = lower_limit - 1;
        bool is_last_row_in_peerset = true;
        if (rowno < upper) {
          if (bring_back_frame_row(thd, w, param, rowno,
                                   Window::REA_LAST_IN_PEERSET))
            return true;
          // Establish current row as base-line for peer set.
          w.reset_order_by_peer_set();
          /*
            Check if the next row is a peer to this row. If not
            set current row as the last row in peerset within
            frame.
          */
          rowno++;
          if (rowno < upper) {
            if (bring_back_frame_row(thd, w, param, rowno,
                                     Window::REA_LAST_IN_PEERSET))
              return true;
            // Compare only the first order by item.
            if (!w.in_new_order_by_peer_set(false))
              is_last_row_in_peerset = false;
          }
        }
        if (is_last_row_in_peerset)
          w.set_is_last_row_in_peerset_within_frame(true);
      }

      if (bring_back_frame_row(thd, w, param, lower_limit - 1,
                               Window::REA_FIRST_IN_FRAME))
        return true;

      w.set_inverse(true);
      if (!new_last_row) {
        w.set_rowno_in_frame(rn_in_frame);
        if (rn_in_frame > 0)
          w.set_is_last_row_in_frame(true);  // do final comp., e.g. div in AVG

        if (copy_funcs(param, thd, CFT_WF_FRAMING)) return true;

        w.set_is_last_row_in_frame(false);  // undo temporary states
      } else {
        if (copy_funcs(param, thd, CFT_WF_FRAMING)) return true;
      }

      w.set_is_last_row_in_peerset_within_frame(false);
      w.set_inverse(false);
    }

    if (have_first_value && (lower_limit <= last_rowno_in_cache)) {
      // We have seen first row of frame, FIRST_VALUE can be computed:
      if (bring_back_frame_row(thd, w, param, lower_limit,
                               Window::REA_FIRST_IN_FRAME))
        return true;

      w.set_rowno_in_frame(1);

      /*
        Framing WFs which accumulate (SUM, COUNT, AVG) shouldn't accumulate
        this row again as they have done so already. Evaluate only
        X_VALUE/MIN/MAX.
      */
      if (copy_funcs(param, thd, CFT_WF_USES_ONLY_ONE_ROW)) return true;
    }

    if (have_last_value && !new_last_row) {
      // We have seen last row of frame, LAST_VALUE can be computed:
      if (bring_back_frame_row(thd, w, param, upper,
                               Window::REA_LAST_IN_FRAME))
        return true;

      w.set_rowno_in_frame(rn_in_frame);

      if (rn_in_frame > 0) w.set_is_last_row_in_frame(true);

      if (copy_funcs(param, thd, CFT_WF_USES_ONLY_ONE_ROW)) return true;

      w.set_is_last_row_in_frame(false);
    }

    if (!have_nth_value.m_offsets.empty()) {
      int fno = 0;
      for (auto nth : have_nth_value.m_offsets) {
        if (lower_limit + nth.m_rowno - 1 <= upper) {
          if (bring_back_frame_row(thd, w, param,
                                   lower_limit + nth.m_rowno - 1,
                                   Window::REA_MISC_POSITIONS, fno++))
            return true;

          w.set_rowno_in_frame(nth.m_rowno);

          if (copy_funcs(param, thd, CFT_WF_USES_ONLY_ONE_ROW)) return true;
        }
      }
    }

    if (new_last_row)  // Add new last row to framing WF's value
    {
      if (bring_back_frame_row(thd, w, param, upper,
                               Window::REA_LAST_IN_FRAME))
        return true;

      w.set_rowno_in_frame(upper - lower_limit + 1)
          .set_is_last_row_in_frame(true);  // temporary states for next copy
      w.set_rowno_being_visited(upper);

      if (copy_funcs(param, thd, CFT_WF_FRAMING)) return true;

      w.set_is_last_row_in_frame(false);  // undo temporary states
    }
  } else if (range_optimizable && w.aggregates_primed()) {
    /*
      Peer sets 2..N in partition: we still have state from previous current
      row's frame computation, now adjust by possibly subtracting rows no
      longer in frame and possibly adding new rows now within range.
    */
    const int64 prev_last_rowno_in_frame = w.last_rowno_in_range_frame();
    const int64 prev_first_rowno_in_frame = w.first_rowno_in_range_frame();

    /*
      As an optimization, if:
      - RANGE frame specification ends at CURRENT ROW and
      - current_row belongs to frame of previous row,
      then both rows are peers, so have the same frame: nothing changes.
    */
    if (range_to_current_row && current_row >= prev_first_rowno_in_frame &&
        current_row <= prev_last_rowno_in_frame) {
      // Peer set should already have been determined:
      DBUG_ASSERT(w.last_rowno_in_peerset() >= current_row);
      have_peers_current_row = true;
    } else {
      /**
        Whether we know the start of the frame yet. The a priori setting is
        inherited from the previous current row.
      */
      bool found_first =
          (prev_first_rowno_in_frame <= prev_last_rowno_in_frame);
      int64 new_first_rowno_in_frame = prev_first_rowno_in_frame;  // a priori

      int64 inverted = 0;  // Number of rows inverted when moving frame
      int64 rowno;         // Partition relative, loop counter

      if (range_from_first_to_current_row) {
        /*
          No need to locate frame's start, it's first row of partition. No
          need to recompute FIRST_VALUE, it's same as for previous row.
          So we just have to accumulate new rows.
        */
        DBUG_ASSERT(current_row > prev_last_rowno_in_frame &&
                    lower_limit == 1 && prev_first_rowno_in_frame == 1 &&
                    found_first);
      } else {
        for (rowno = lower_limit;
             (rowno <= upper &&
              prev_first_rowno_in_frame <= prev_last_rowno_in_frame);
             rowno++) {
          /* Set up the non-wf fields for aggregating to the output row. */
          if (bring_back_frame_row(thd, w, param, rowno,
                                   Window::REA_FIRST_IN_FRAME))
            return true;

          if (w.before_frame()) {
            w.set_inverse(true)
                .
                /*
                  The next setting sets the logical last row number in the
                  frame after inversion, so that final actions can do the
                  right thing, e.g. AVG needs to know the updated cardinality.
                  The aggregates consult m_rowno_in_frame for that, so set it
                  accordingly.
                */
                set_rowno_in_frame(prev_last_rowno_in_frame -
                                   prev_first_rowno_in_frame + 1 - ++inverted)
                .set_is_last_row_in_frame(true);  // pessimistic assumption

            // Set the current row as the last row in the peerset.
            w.set_is_last_row_in_peerset_within_frame(true);

            /*
              It may be that rowno is not in previous frame; for example if
              column id contains 1, 3, 4 and 5 and frame is RANGE BETWEEN 2
              FOLLOWING AND 2 FOLLOWING: we process id=1, frame of id=1 is
              id=3; then we process id=3: id=3 is before frame (and was in
              previous frame), id=4 is before frame too (and was not in
              previous frame); so id=3 only should be inverted:
            */
            if (rowno >= prev_first_rowno_in_frame &&
                rowno <= prev_last_rowno_in_frame) {
              if (copy_funcs(param, thd, CFT_WF_FRAMING)) return true;
            }

            w.set_inverse(false).set_is_last_row_in_frame(false);
            w.set_is_last_row_in_peerset_within_frame(false);
            found_first = false;
          } else {
            if (w.after_frame()) {
              found_first = false;
            } else {
              w.set_first_rowno_in_range_frame(rowno);
              found_first = true;
              new_first_rowno_in_frame = rowno;
              w.set_rowno_in_frame(1);
            }

            break;
          }
        }

        if ((have_first_value || have_last_value) &&
            (rowno <= last_rowno_in_cache) && found_first) {
          /*
            We have FIRST_VALUE or LAST_VALUE and have a new first row; make
            it last also until we find something better.
          */
          w.set_is_last_row_in_frame(true);
          w.set_rowno_being_visited(rowno);

          if (copy_funcs(param, thd, CFT_WF_USES_ONLY_ONE_ROW)) return true;
          w.set_is_last_row_in_frame(false);

          if (have_last_value && w.last_rowno_in_range_frame() > rowno) {
            /* Set up the non-wf fields for aggregating to the output row. */
            if (bring_back_frame_row(thd, w, param,
                                     w.last_rowno_in_range_frame(),
                                     Window::REA_LAST_IN_FRAME))
              return true;

            w.set_rowno_in_frame(w.last_rowno_in_range_frame() -
                                 w.first_rowno_in_range_frame() + 1)
                .set_is_last_row_in_frame(true);
            w.set_rowno_being_visited(w.last_rowno_in_range_frame());
            if (copy_funcs(param, thd, CFT_WF_USES_ONLY_ONE_ROW)) return true;
            w.set_is_last_row_in_frame(false);
          }
        }
      }

      /*
        We last evaluated last_rowno_in_range_frame for the previous current
        row. Now evaluate over any new rows within range of the current row.
      */
      const int64 first = w.last_rowno_in_range_frame() + 1;
      bool row_added = false;

      for (rowno = first; rowno <= upper; rowno++) {
        w.save_pos(Window::REA_LAST_IN_FRAME);
        if (bring_back_frame_row(thd, w, param, rowno,
                                 Window::REA_LAST_IN_FRAME))
          return true;

        if (w.before_frame()) {
          if (!found_first) new_first_rowno_in_frame++;
          continue;
        } else if (w.after_frame()) {
          w.set_last_rowno_in_range_frame(rowno - 1);
          if (!found_first) w.set_first_rowno_in_range_frame(rowno);
          /*
            We read one row too far, so reinstate previous hint for last in
            frame. We will likely be reading the last row in frame again for
            the next current row, and then we will need the hint.
          */
          w.restore_pos(Window::REA_LAST_IN_FRAME);
          break;
        }  // else: row is within range, process

        const int64 rowno_in_frame = rowno - new_first_rowno_in_frame + 1;

        if (rowno_in_frame == 1 && !found_first) {
          found_first = true;
          w.set_first_rowno_in_range_frame(rowno);
          // Found the first row in this range frame. Make a note in the hint.
          w.copy_pos(Window::REA_LAST_IN_FRAME, Window::REA_FIRST_IN_FRAME);
        }
        w.set_rowno_in_frame(rowno_in_frame)
            .set_is_last_row_in_frame(true);  // pessimistic assumption
        w.set_rowno_being_visited(rowno);

        if (copy_funcs(param, thd, CFT_WF_FRAMING)) return true;

        w.set_is_last_row_in_frame(false);  // undo temporary states
        row_added = true;
      }

      if (rowno > upper && row_added)
        w.set_last_rowno_in_range_frame(rowno - 1);

      if (range_to_current_row) {
        w.set_last_rowno_in_peerset(w.last_rowno_in_range_frame());
        have_peers_current_row = true;
      }

      if (found_first && !have_nth_value.m_offsets.empty()) {
        // frame is non-empty, so we might find NTH_VALUE
        DBUG_ASSERT(w.first_rowno_in_range_frame() <=
                    w.last_rowno_in_range_frame());
        int fno = 0;
        for (auto nth : have_nth_value.m_offsets) {
          const int64 row_to_get =
              w.first_rowno_in_range_frame() + nth.m_rowno - 1;
          if (row_to_get <= w.last_rowno_in_range_frame()) {
            if (bring_back_frame_row(thd, w, param, row_to_get,
                                     Window::REA_MISC_POSITIONS, fno++))
              return true;

            w.set_rowno_in_frame(nth.m_rowno);

            if (copy_funcs(param, thd, CFT_WF_USES_ONLY_ONE_ROW)) return true;
          }
        }
      }

      // We have empty frame, maintain invariant
      if (!found_first) {
        DBUG_ASSERT(!row_added);
        w.set_first_rowno_in_range_frame(w.last_rowno_in_range_frame() + 1);
      }
    }
  }

  /* We need the peer of the current row to evaluate the row. */
  if (needs_peerset && !have_peers_current_row) {
    int64 first = current_row;

    if (current_row != 1) first = w.last_rowno_in_peerset() + 1;

    if (current_row >= first) {
      int64 rowno;
      for (rowno = current_row; rowno <= last_rowno_in_cache; rowno++) {
        if (bring_back_frame_row(thd, w, param, rowno,
                                 Window::REA_LAST_IN_PEERSET))
          return true;

        if (rowno == current_row) {
          /* establish current row as base-line for peer set */
          w.reset_order_by_peer_set();
          w.set_last_rowno_in_peerset(current_row);
        } else if (w.in_new_order_by_peer_set()) {
          w.set_last_rowno_in_peerset(rowno - 1);
          break;  // we have accumulated all rows in the peer set
        }
      }
      if (rowno > last_rowno_in_cache)
        w.set_last_rowno_in_peerset(last_rowno_in_cache);
    }
  }
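
  /*
    Added note (illustrative): CUME_DIST is an example of a function with
    needs_peerset: it depends on the position of the current row's last peer
    within the partition, which is why the loop above must locate that last
    peer before the function can be evaluated.
  */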

  if (optimizable && optimizable_primed) w.set_aggregates_primed(true);

  if (bring_back_frame_row(thd, w, param, current_row, Window::REA_CURRENT))
    return true;

  /* NTILE and other non-framing wfs */
  if (w.needs_card()) {
    /* Set up the non-wf fields for aggregating to the output row. */
    if (process_wfs_needing_card(thd, param, have_nth_value, have_lead_lag,
                                 current_row, w, Window::REA_CURRENT))
      return true;
  }

  if (w.is_last() && copy_funcs(param, thd, CFT_HAS_WF)) return true;
  *output_row_ready = true;
  w.set_last_row_output(current_row);
  DBUG_PRINT("info", ("sent row: %" PRId64, current_row));

  return false;
}

/**
  The last step in a series of windows does not need to write a tmp file
  if both a) and b) hold:

  a) no SELECT DISTINCT
  b) no final ORDER BY

  that have not been eliminated. If the condition is true, we send the data
  directly over the protocol to save the round trip to and from the tmp file.
*/
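/*
  Added illustration (an assumption, not from the original comment): for a
  query such as

    SELECT a, SUM(a) OVER (ORDER BY a) FROM t;

  with no SELECT DISTINCT and no query-level ORDER BY surviving optimization,
  the last windowing step can short-circuit and stream each finished row
  directly to the client instead of writing it to the final tmp table.
*/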
static inline enum_nested_loop_state write_or_send_row(
    JOIN *join, QEP_TAB *const qep_tab, TABLE *const table,
    Temp_table_param *const out_tbl) {
  if (out_tbl->m_window_short_circuit) {
    if (join->send_records >= join->unit->select_limit_cnt)
      return NESTED_LOOP_QUERY_LIMIT;
    enum_nested_loop_state nls =
        (*qep_tab->next_select)(join, qep_tab + 1, false);
    return nls;
  }
  int error;
  if ((error = table->file->ha_write_row(table->record[0]))) {
    if (table->file->is_ignorable_error(error)) return NESTED_LOOP_OK;

    /*
      - Convert to disk-based table,
      - and setup index access over hash field; that is usually done by
        QEP_tmp_table::prepare_tmp_table() but we may have a set of buffered
        rows to write before such function is executed.
    */
    if (create_ondisk_from_heap(join->thd, table, error, true, NULL) ||
        (table->hash_field && table->file->ha_index_init(0, 0)))
      return NESTED_LOOP_ERROR;  // Not a table_is_full error
  }

  if (++qep_tab->send_records >= out_tbl->end_write_records &&
      join->do_send_rows) {
    if (!join->calc_found_rows) return NESTED_LOOP_QUERY_LIMIT;
    join->do_send_rows = 0;
    join->unit->select_limit_cnt = HA_POS_ERROR;
    return NESTED_LOOP_OK;
  }
  join->thd->get_stmt_da()->inc_current_row_for_condition();

  return NESTED_LOOP_OK;
}

/* ARGSUSED */
static enum_nested_loop_state end_write(JOIN *join, QEP_TAB *const qep_tab,
                                        bool end_of_records) {
  DBUG_TRACE;

  TABLE *const table = qep_tab->table();

  if (join->thd->killed)  // Aborted by user
  {
    join->thd->send_kill_message();
    return NESTED_LOOP_KILLED; /* purecov: inspected */
  }
  if (!end_of_records) {
    Temp_table_param *const tmp_tbl = qep_tab->tmp_table_param;
    Switch_ref_item_slice slice_switch(join, qep_tab->ref_item_slice);
    DBUG_ASSERT(qep_tab - 1 != join->ref_slice_immediately_before_group_by);

    if (copy_fields_and_funcs(tmp_tbl, join->thd))
      return NESTED_LOOP_ERROR; /* purecov: inspected */

    if (having_is_true(qep_tab->having)) {
      int error;
      join->found_records++;

      if (!check_unique_constraint(table)) goto end;  // skip it

      if ((error = table->file->ha_write_row(table->record[0]))) {
        if (table->file->is_ignorable_error(error)) goto end;
        if (create_ondisk_from_heap(join->thd, table, error, true, NULL))
          return NESTED_LOOP_ERROR;  // Not a table_is_full error
      }
      if (++qep_tab->send_records >= tmp_tbl->end_write_records &&
          join->do_send_rows) {
        if (!join->calc_found_rows) return NESTED_LOOP_QUERY_LIMIT;
        join->do_send_rows = 0;
        join->unit->select_limit_cnt = HA_POS_ERROR;
        return NESTED_LOOP_OK;
      }
      join->thd->get_stmt_da()->inc_current_row_for_condition();
    }
  }
end:
  return NESTED_LOOP_OK;
}

/* ARGSUSED */

/**
  Similar to end_write, but used in the windowing tmp table steps
*/
static enum_nested_loop_state end_write_wf(JOIN *join, QEP_TAB *const qep_tab,
                                           bool end_of_records) {
  DBUG_TRACE;
  THD *const thd = join->thd;

  if (thd->killed)  // Aborted by user
  {
    thd->send_kill_message();
    return NESTED_LOOP_KILLED; /* purecov: inspected */
  }

  Temp_table_param *const out_tbl = qep_tab->tmp_table_param;

  /**
    If we don't need to buffer rows to evaluate the window functions,
    execution is simple, see logic below. In that case we can just evaluate
    the window functions as we go here, similar to the non-windowing flow,
    cf. copy_funcs below and in end_write.

    If we do need buffering, though, we buffer the row here. Next, we enter a
    loop calling process_buffered_windowing_record and conditionally write
    (or send) the row onward. That is, if process_buffered_windowing_record
    was able to complete evaluation of a row (cf. output_row_ready),
    including its window functions given how much has already been buffered,
    we do the write (or send), else we exit, and postpone evaluation and
    writing till we have enough rows in the buffer.

    When we have read a full partition (or reach EOF), we evaluate any
    remaining rows. Note that since we have to read one row past the current
    partition to detect that the previous row was indeed the last row in a
    partition, we need to re-establish the first row of the next partition
    when we are done processing the current one. This is because the record
    will be overwritten (many times) during evaluation of window functions in
    the current partition.

    Usually [1], for window execution we have two or three tmp tables per
    windowing step involved:

    - The input table, corresponding to qep_tab-1. Holds (possibly sorted)
      records ready for windowing, sorted on expressions concatenated from
      any PARTITION BY and ORDER BY clauses.

    - The output table, corresponding to qep_tab: where we write the
      evaluated records from this step. Note that we may optimize away this
      last write if we have no final ORDER BY or DISTINCT, see
      write_or_send_row.

    - If we have buffering, the frame buffer, held by
      Window::m_frame_buffer[_param]

    [1] This is not always the case. For the first window, if we have no
    PARTITION BY or ORDER BY in the window, and there is more than one table
    in the join, the logical input can consist of more than one table
    (qep_tab-1 .. qep_tab-n).

    The first thing we do in this function is:
    we copy fields from IN to OUT (copy_fields), and evaluate non-WF
    functions (copy_funcs): those functions then read their arguments from IN
    and store their result into their result_field which is a field in OUT.
    We then evaluate any HAVING, on OUT table.
    The next steps depend on whether we have a FB (Frame Buffer) or not.

    (a) If we have no FB, we immediately calculate the WFs over the OUT row,
    store their value in OUT row, and pass control to next plan operator
    (write_or_send_row) - we're done.

    (b) If we have a FB, let's take SUM(A+FLOOR(B)) OVER (ROWS 2 FOLLOWING)
    as example. Above, we have stored A and the result of FLOOR in OUT. Now
    we buffer (save) the row into the FB: for that, we copy field A from IN
    to FB, and FLOOR's result_field from OUT to FB; a single copy_fields()
    call handles both copy jobs.
    Then we look at the rows we have buffered and may realize that we have
    enough of the frame to calculate SUM for a certain row (not necessarily
    the one we just buffered; might be an earlier row, in our example it is
    the row which is 2 rows above the buffered row). If we do, to calculate
    WFs, we bring back the frame's rows; which is done by:
    first copying field A and FLOOR's result_field in directions opposite to
    above (using one copy_fields), then copying field A from IN to OUT, thus
    getting in OUT all that SUM needs (A and FLOOR), then giving that OUT row
    to SUM (SUM will then add the row's value to its total; that happens in
    copy_funcs). After we have done that on all rows of the frame, we have
    the values of SUM ready in OUT, we also restore the row which owns this
    SUM value, in the same way as we restored the frame's rows, and we pass
    control to next plan operator (write_or_send_row) - we're done for this
    row. However, when the next plan operator is done and we regain control,
    we loop to check if we can calculate one more row with the frame we have,
    and if so, we do. Until we can't calculate any more row in which case
    we're back to just buffering.

    @todo If we have buffering, for fields (not result_field of non-WF
    functions), we do:
    copy_fields IN->OUT, copy_fields IN->FB (buffering phase), and later
    (restoration phase): copy_fields FB->IN, copy_fields IN->OUT.
    The copy_fields IN->OUT before buffering is useless as the OUT values
    will not be used (they'll be overwritten). We have two possible
    alternative improvements, any of which would avoid one copying:
    - remove this copy_fields (the buffering-phase IN->OUT)
    - keep it but change the rest to: OUT->FB, FB->OUT; that eliminates the
    restoration-phase IN->OUT; this design would be in line with what is done
    for result_field of non-WF functions.
  */
  Window *const win = out_tbl->m_window;
  const bool window_buffering = win->needs_buffering();

  if (end_of_records && !window_buffering) return NESTED_LOOP_OK;

  /*
    All evaluations of functions, done in process_buffered_windowing_record()
    and copy_funcs(), are using values of the out table, so we must use its
    slice:
  */
  Switch_ref_item_slice slice_switch(join, qep_tab->ref_item_slice);
  DBUG_ASSERT(qep_tab - 1 != join->ref_slice_immediately_before_group_by &&
              qep_tab != join->ref_slice_immediately_before_group_by);

  TABLE *const table = qep_tab->table();
  if (window_buffering) {
    bool new_partition = false;
    if (!end_of_records) {
      /*
        This saves the values of non-WF functions for the row. For example,
        1+t.a.
      */
      if (copy_fields_and_funcs(out_tbl, thd, CFT_HAS_NO_WF))
        return NESTED_LOOP_ERROR; /* purecov: inspected */

      if (!having_is_true(qep_tab->having))
        goto end;  // Didn't match having, skip it

      if (buffer_windowing_record(thd, out_tbl, &new_partition))
        return NESTED_LOOP_ERROR;

      join->found_records++;
    }

  repeat:
    while (true) {
      bool output_row_ready = false;
      if (process_buffered_windowing_record(
              thd, out_tbl, new_partition || end_of_records,
              &output_row_ready))
        return NESTED_LOOP_ERROR;

      if (!output_row_ready) break;

      if (!check_unique_constraint(table))  // In case of SELECT DISTINCT
        continue;                           // skip it

      enum_nested_loop_state result;
      if ((result = write_or_send_row(join, qep_tab, table, out_tbl)))
        return result;  // Not a table_is_full error

      if (thd->killed)  // Aborted by user
      {
        thd->send_kill_message();
        return NESTED_LOOP_KILLED;
      }
    }

    if (new_partition) {
      /*
        We didn't really buffer this row yet since we found a partition
        change, so we had to finalize the previous partition first.
        Bring back saved row for next partition.
      */
      if (bring_back_frame_row(thd, *win, out_tbl,
                               Window::FBC_FIRST_IN_NEXT_PARTITION,
                               Window::REA_WONT_UPDATE_HINT))
        return NESTED_LOOP_ERROR;

      /*
        copy_funcs(CFT_NON_WF) is not necessary: a non-WF function was
        calculated and saved in OUT, then this OUT column was copied to
        special record, then restored to OUT column.
      */

      win->reset_partition_state();
      if (buffer_windowing_record(thd, out_tbl,
                                  nullptr /* first in new partition */))
        return NESTED_LOOP_ERROR;
      new_partition = false;
      goto repeat;
    }
    if (!end_of_records && win->needs_restore_input_row()) {
      /*
        Reestablish last row read from input table in case it is needed again
        before reading a new row. May be necessary if this is the first
        window following after a join, cf. the caching presumption in
        EQRefIterator. This logic can be removed if we move to copying
        between out tmp record and frame buffer record, instead of involving
        the in record. FIXME.
      */
      if (bring_back_frame_row(thd, *win, nullptr /* no copy to OUT */,
                               Window::FBC_LAST_BUFFERED_ROW,
                               Window::REA_WONT_UPDATE_HINT))
        return NESTED_LOOP_ERROR;
    }
  } else {
    if (copy_fields_and_funcs(out_tbl, thd, CFT_HAS_NO_WF))
      return NESTED_LOOP_ERROR; /* purecov: inspected */

    if (!having_is_true(qep_tab->having))
      goto end;  // Didn't match having, skip it

    win->check_partition_boundary();

    if (copy_funcs(out_tbl, thd, CFT_WF))
      return NESTED_LOOP_ERROR; /* purecov: inspected */

    if (win->is_last() && copy_funcs(out_tbl, thd, CFT_HAS_WF))
      return NESTED_LOOP_ERROR; /* purecov: inspected */

    join->found_records++;

    if (!check_unique_constraint(table))  // In case of SELECT DISTINCT
      goto end;                           // skip it

    DBUG_PRINT("info", ("end_write: writing record at %p", table->record[0]));

    enum_nested_loop_state result;
    if ((result = write_or_send_row(join, qep_tab, table, out_tbl)))
      return result;  // Not a table_is_full error
  }
end:
  return NESTED_LOOP_OK;
}

/* ARGSUSED */
/** Group by searching after group record and updating it if possible. */

static enum_nested_loop_state end_update(JOIN *join, QEP_TAB *const qep_tab,
                                         bool end_of_records) {
  TABLE *const table = qep_tab->table();
  ORDER *group;
  int error;
  bool group_found = false;
  DBUG_TRACE;

  if (end_of_records) return NESTED_LOOP_OK;
  if (join->thd->killed)  // Aborted by user
  {
    join->thd->send_kill_message();
    return NESTED_LOOP_KILLED; /* purecov: inspected */
  }

  Temp_table_param *const tmp_tbl = qep_tab->tmp_table_param;
  join->found_records++;

  // See comment below.
  DBUG_ASSERT(tmp_tbl->grouped_expressions.size() == 0);

  if (copy_fields(tmp_tbl, join->thd))  // Groups are copied twice.
    return NESTED_LOOP_ERROR;           /* purecov: inspected */

  /* Make a key of group index */
  if (table->hash_field) {
    /*
      We need to call copy_funcs here in order to get a correct value for
      hash_field. However, this call isn't needed so early when hash_field
      isn't used, as it would cause unnecessary additional evaluation of
      functions to be copied when the 2nd and subsequent records in a group
      are found.
    */
    if (copy_funcs(tmp_tbl, join->thd))
      return NESTED_LOOP_ERROR; /* purecov: inspected */
    if (!check_unique_constraint(table)) group_found = true;
  } else {
    for (group = table->group; group; group = group->next) {
      Item *item = *group->item;
      item->save_org_in_field(group->field_in_tmp_table);
      /* Store in the used key if the field was 0 */
      if (item->maybe_null)
        group->buff[-1] = (char)group->field_in_tmp_table->is_null();
    }
    const uchar *key = tmp_tbl->group_buff;
    if (!table->file->ha_index_read_map(table->record[1], key, HA_WHOLE_KEY,
                                        HA_READ_KEY_EXACT))
      group_found = true;
  }
  if (group_found) {
    /* Update old record */
    restore_record(table, record[1]);
    update_tmptable_sum_func(join->sum_funcs, table);
    if ((error =
             table->file->ha_update_row(table->record[1], table->record[0]))) {
      // Old and new records are the same, ok to ignore
      if (error == HA_ERR_RECORD_IS_THE_SAME) return NESTED_LOOP_OK;
      table->file->print_error(error, MYF(0)); /* purecov: inspected */
      return NESTED_LOOP_ERROR;                /* purecov: inspected */
    }
    return NESTED_LOOP_OK;
  }

  /*
    Why, unlike in other end_* functions, do we advance the slice here and
    not before copy_fields()?
    Because of the evaluation of *group->item above: if we do it with this
    tmp table's slice, *group->item points to the field materializing the
    expression, which hasn't been calculated yet. We could force the missing
    calculation by doing copy_funcs() before evaluating *group->item; but
    then, for a group made of N rows, we might be doing N evaluations of
    another function when only one would suffice (like the '*' in
    "SELECT a, a*a ... GROUP BY a": only the first/last row of the group
    needs to evaluate a*a).

    The assertion on tmp_tbl->grouped_expressions.size() is to make sure
    copy_fields() doesn't suffer from the late switching.
  */
  Switch_ref_item_slice slice_switch(join, qep_tab->ref_item_slice);
  DBUG_ASSERT(qep_tab - 1 != join->ref_slice_immediately_before_group_by &&
              qep_tab != join->ref_slice_immediately_before_group_by);

  /*
    Copy null bits from group key to table
    We can't copy all data as the key may have a different format
    from the row data (for example as with VARCHAR keys)
  */
  if (!table->hash_field) {
    KEY_PART_INFO *key_part;
    for (group = table->group, key_part = table->key_info[0].key_part; group;
         group = group->next, key_part++) {
      // Field null indicator is located one byte ahead of field value.
      // @todo - check if this NULL byte is really necessary for grouping
      if (key_part->null_bit)
        memcpy(table->record[0] + key_part->offset - 1, group->buff - 1, 1);
    }
    /* See comment on copy_funcs above. */

    if (copy_funcs(tmp_tbl, join->thd))
      return NESTED_LOOP_ERROR; /* purecov: inspected */
  }
  init_tmptable_sum_functions(join->sum_funcs);
  if ((error = table->file->ha_write_row(table->record[0]))) {
    if (create_ondisk_from_heap(join->thd, table, error, false, NULL))
      return NESTED_LOOP_ERROR;  // Not a table_is_full error
    /* Change method to update rows */
    if ((error = table->file->ha_index_init(0, 0))) {
      table->file->print_error(error, MYF(0));
      return NESTED_LOOP_ERROR;
    }
  }
  qep_tab->send_records++;
  return NESTED_LOOP_OK;
}
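
/*
  Added note (illustrative): for SELECT a, SUM(b) FROM t GROUP BY a, the
  first row seen for a given value of "a" is written to the tmp table as a
  new group row; each subsequent row of the group locates that row (via the
  hash field or the group key) and folds its contribution in with
  ha_update_row(), as done in end_update() above.
*/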
|
|
|
|
enum_nested_loop_state end_write_group(JOIN *join, QEP_TAB *const qep_tab,
                                       bool end_of_records) {
  TABLE *table = qep_tab->table();
  int idx = -1;
  DBUG_TRACE;

  if (join->thd->killed) {  // Aborted by user
    join->thd->send_kill_message();
    return NESTED_LOOP_KILLED; /* purecov: inspected */
  }
  /*
    (1) Haven't seen a first row yet
    (2) Have seen all rows
    (3) GROUP expressions are different from previous row's
  */
  if (!join->seen_first_record ||                                     // (1)
      end_of_records ||                                               // (2)
      (idx = update_item_cache_if_changed(join->group_fields)) >= 0)  // (3)
  {
    Temp_table_param *const tmp_tbl = qep_tab->tmp_table_param;
    if (join->seen_first_record || (end_of_records && !join->grouped)) {
      if (idx < (int)join->send_group_parts) {
        /*
          As GROUP expressions have changed, we now send forward the group
          of the previous row.
        */
        Switch_ref_item_slice slice_switch(join, qep_tab->ref_item_slice);
        DBUG_ASSERT(qep_tab - 1 !=
                        join->ref_slice_immediately_before_group_by &&
                    qep_tab != join->ref_slice_immediately_before_group_by);
        {
          table_map save_nullinfo = 0;
          if (!join->seen_first_record) {
            // Calculate aggregate functions for no rows
            for (Item &item : *join->get_current_fields()) {
              item.no_rows_in_result();
            }

            /*
              Mark tables as containing only NULL values for ha_write_row().
              Calculate a set of tables for which NULL values need to
              be restored after sending data.
            */
            if (join->clear_fields(&save_nullinfo))
              return NESTED_LOOP_ERROR; /* purecov: inspected */
          }
          copy_sum_funcs(join->sum_funcs,
                         join->sum_funcs_end[join->send_group_parts]);
          if (having_is_true(qep_tab->having)) {
            int error = table->file->ha_write_row(table->record[0]);
            if (error &&
                create_ondisk_from_heap(join->thd, table, error, false, NULL))
              return NESTED_LOOP_ERROR;
          }
          if (join->rollup.state != ROLLUP::STATE_NONE) {
            if (join->rollup_write_data((uint)(idx + 1), qep_tab))
              return NESTED_LOOP_ERROR;
          }
          // Restore NULL values if needed.
          if (save_nullinfo) join->restore_fields(save_nullinfo);
        }
        if (end_of_records) return NESTED_LOOP_OK;
      }
    } else {
      if (end_of_records) return NESTED_LOOP_OK;
      join->seen_first_record = true;

      // Initialize the cache of GROUP expressions with this 1st row's values
      (void)(update_item_cache_if_changed(join->group_fields));
    }
    if (idx < (int)join->send_group_parts) {
      /*
        As GROUP expressions have changed, initialize the new group:
        (1) copy non-aggregated expressions (they're constant over the group)
        (2) and reset group aggregate functions.

        About (1): some expressions to copy are not Item_fields and they are
        copied by copy_fields() which evaluates them (see
        param->grouped_expressions, set up in setup_copy_fields()). Thus,
        copy_fields() can evaluate functions. One of them, F2, may reference
        another one F1, example: SELECT expr AS F1 ... GROUP BY ... HAVING
        F2(F1)<=2 . Assume F1 and F2 are not aggregate functions. Then they are
        calculated by copy_fields() when starting a new group, i.e. here. As F2
        uses an alias to F1, F1 is calculated first; F2 must use that value (not
        evaluate expr again, as expr may not be deterministic), so F2 uses a
        reference (Item_ref) to the already-computed value of F1; that value is
        in Item_copy part of REF_SLICE_ORDERED_GROUP_BY. So, we switch to that
        slice.
      */
      Switch_ref_item_slice slice_switch(join, qep_tab->ref_item_slice);
      if (copy_fields_and_funcs(tmp_tbl, join->thd))  // (1)
        return NESTED_LOOP_ERROR;
      if (init_sum_functions(join->sum_funcs,
                             join->sum_funcs_end[idx + 1]))  // (2)
        return NESTED_LOOP_ERROR;
      return NESTED_LOOP_OK;
    }
  }
  if (update_sum_func(join->sum_funcs)) return NESTED_LOOP_ERROR;
  return NESTED_LOOP_OK;
}
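
/*
  A worked illustration of the streaming protocol above (an example, not
  taken from server output): with rows arriving ordered on the GROUP BY
  key, e.g. (a=1), (a=1), (a=2), followed by end_of_records, the calls
  proceed as:

    end_write_group(a=1)         -> first row: cache the group key and
                                    initialize the aggregate functions
    end_write_group(a=1)         -> key unchanged: update_sum_func() only
    end_write_group(a=2)         -> key changed: write the row for group
                                    a=1, then re-initialize for group a=2
    end_write_group(end_of_records=true) -> flush the final group a=2
*/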

/*****************************************************************************
  Remove duplicates from tmp table.
  This should be recoded to add a unique index to the table and remove
  duplicates.
  The table is a locked single-thread table.
  'fields' is the number of fields to check (from the end).
*****************************************************************************/

static bool compare_record(TABLE *table, Field **ptr) {
  for (; *ptr; ptr++) {
    if ((*ptr)->cmp_offset(table->s->rec_buff_length)) return true;
  }
  return false;
}
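
// Note: cmp_offset(rec_buff_length) compares each field's value in
// table->record[0] against the corresponding bytes in table->record[1]
// (the two record buffers are allocated back-to-back, rec_buff_length
// apart); remove_dup_with_compare() below relies on this to detect
// duplicate rows.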

static bool copy_blobs(Field **ptr) {
  for (; *ptr; ptr++) {
    if ((*ptr)->flags & BLOB_FLAG)
      if (((Field_blob *)(*ptr))->copy()) return true;  // Error
  }
  return false;
}

static void free_blobs(Field **ptr) {
  for (; *ptr; ptr++) {
    if ((*ptr)->flags & BLOB_FLAG) ((Field_blob *)(*ptr))->mem_free();
  }
}

/**
  For a set of fields, compute how many bytes their respective sort keys need.

  @param first_field         Array of fields, terminated by nullptr.
  @param[out] field_lengths  The computed sort buffer length for each field.
                             Must be allocated by the caller.

  @returns The total number of bytes needed, sans extra alignment.

  @note
    This assumes that Field::sort_length() is constant for each field.
*/

static size_t compute_field_lengths(Field **first_field,
                                    size_t *field_lengths) {
  Field **field;
  size_t *field_length;
  size_t total_length = 0;
  for (field = first_field, field_length = field_lengths; *field;
       ++field, ++field_length) {
    size_t length = (*field)->sort_length();
    const CHARSET_INFO *cs = (*field)->sort_charset();
    length = cs->coll->strnxfrmlen(cs, length);

    if ((*field)->sort_key_is_varlen()) {
      // Make room for the length.
      length += sizeof(uint32);
    }

    *field_length = length;
    total_length += length;
  }
  return total_length;
}
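
/*
  Example (a sketch; the exact numbers depend on the collation): for the
  field pair (INT, VARCHAR(10) CHARACTER SET utf8mb4), sort_length() might
  yield 4 and 40 bytes; strnxfrmlen() may expand the character field
  further, and since its sort key is variable-length it gets sizeof(uint32)
  extra bytes for a length prefix. field_lengths would then hold
  {4, 44} and the function would return their sum, 48.
*/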

bool QEP_TAB::remove_duplicates() {
  bool error;
  DBUG_ASSERT(this - 1 != join()->ref_slice_immediately_before_group_by &&
              this != join()->ref_slice_immediately_before_group_by);
  THD *thd = join()->thd;
  DBUG_TRACE;

  DBUG_ASSERT(join()->tmp_tables > 0 && table()->s->tmp_table != NO_TMP_TABLE);

  TABLE *const tbl = table();

  tbl->reginfo.lock_type = TL_WRITE;

  Opt_trace_object trace_wrapper(&thd->opt_trace);
  trace_wrapper.add("eliminating_duplicates_from_table_in_plan_at_position",
                    idx());

  // How many saved fields there are in the list
  uint field_count = tbl->s->fields - tmp_table_param->hidden_field_count;
  DBUG_ASSERT((int)field_count >= 0);

  if (!field_count && !join()->calc_found_rows &&
      !having) {  // only const items with no OPTION_FOUND_ROWS
    join()->unit->select_limit_cnt = 1;  // Only send first row
    needs_duplicate_removal = false;
    return false;
  }
  Field **first_field = tbl->field + tbl->s->fields - field_count;

  size_t *field_lengths =
      (size_t *)my_malloc(key_memory_hash_index_key_buffer,
                          field_count * sizeof(*field_lengths), MYF(MY_WME));
  if (field_lengths == nullptr) return true;

  size_t key_length = compute_field_lengths(first_field, field_lengths);

  free_io_cache(tbl);  // Safety
  tbl->file->info(HA_STATUS_VARIABLE);
  constexpr int HASH_OVERHEAD = 16;  // Very approximate.
  if (!tbl->s->blob_fields &&
      (tbl->s->db_type() == temptable_hton || tbl->s->db_type() == heap_hton ||
       ((ALIGN_SIZE(key_length) + HASH_OVERHEAD) * tbl->file->stats.records <
        join()->thd->variables.sortbuff_size)))
    error = remove_dup_with_hash_index(thd, tbl, first_field, field_lengths,
                                       key_length, having);
  else {
    ulong offset =
        field_count
            ? tbl->field[tbl->s->fields - field_count]->offset(tbl->record[0])
            : 0;
    error = remove_dup_with_compare(thd, tbl, first_field, offset, having);
  }

  my_free(field_lengths);

  free_blobs(first_field);
  needs_duplicate_removal = false;
  return error;
}

static bool remove_dup_with_compare(THD *thd, TABLE *table, Field **first_field,
                                    ulong offset, Item *having) {
  handler *file = table->file;
  char *org_record, *new_record;
  uchar *record;
  int error;
  ulong reclength = table->s->reclength - offset;
  DBUG_TRACE;

  org_record = (char *)(record = table->record[0]) + offset;
  new_record = (char *)table->record[1] + offset;

  if ((error = file->ha_rnd_init(1))) goto err;
  error = file->ha_rnd_next(record);
  for (;;) {
    if (thd->killed) {
      thd->send_kill_message();
      error = 0;
      goto err;
    }
    if (error) {
      if (error == HA_ERR_RECORD_DELETED) {
        error = file->ha_rnd_next(record);
        continue;
      }
      if (error == HA_ERR_END_OF_FILE) break;
      goto err;
    }
    if (!having_is_true(having)) {
      if ((error = file->ha_delete_row(record))) goto err;
      error = file->ha_rnd_next(record);
      continue;
    }
    if (copy_blobs(first_field)) {
      error = 0;
      goto err;
    }
    memcpy(new_record, org_record, reclength);

    /* Read through rest of file and mark duplicated rows deleted */
    bool found = false;
    for (;;) {
      if ((error = file->ha_rnd_next(record))) {
        if (error == HA_ERR_RECORD_DELETED) continue;
        if (error == HA_ERR_END_OF_FILE) break;
        goto err;
      }
      if (!compare_record(table, first_field)) {
        if ((error = file->ha_delete_row(record))) goto err;
      } else if (!found) {
        found = true;
        file->position(record);  // Remember position
      }
    }
    if (!found) break;  // End of file
    /* Restart search on next row */
    error = file->ha_rnd_pos(record, file->ref);
  }

  return false;
err:
  if (file->inited) (void)file->ha_rnd_end();
  if (error) file->print_error(error, MYF(0));
  return true;
}

/**
  Generate a hash index for each row to quickly find duplicate rows.

  @note This will not work on tables with blobs!
*/

static bool remove_dup_with_hash_index(THD *thd, TABLE *table,
                                       Field **first_field,
                                       const size_t *field_lengths,
                                       size_t key_length, Item *having) {
  uchar *record = table->record[0];
  int error;
  handler *file = table->file;
  DBUG_TRACE;

  MEM_ROOT mem_root(key_memory_hash_index_key_buffer, 32768);
  memroot_unordered_set<std::string> hash(&mem_root);
  hash.reserve(file->stats.records);

  std::unique_ptr<uchar[]> key_buffer(new uchar[key_length]);
  if ((error = file->ha_rnd_init(1))) goto err;
  for (;;) {
    uchar *key_pos = key_buffer.get();
    if (thd->killed) {
      thd->send_kill_message();
      error = 0;
      goto err;
    }
    if ((error = file->ha_rnd_next(record))) {
      if (error == HA_ERR_RECORD_DELETED) continue;
      if (error == HA_ERR_END_OF_FILE) break;
      goto err;
    }
    if (!having_is_true(having)) {
      if ((error = file->ha_delete_row(record))) goto err;
      continue;
    }

    /* copy fields to key buffer */
    const size_t *field_length = field_lengths;
    for (Field **ptr = first_field; *ptr; ++ptr, ++field_length) {
      if ((*ptr)->sort_key_is_varlen()) {
        size_t len = (*ptr)->make_sort_key(key_pos + sizeof(uint32),
                                           *field_length - sizeof(uint32));
        int4store(key_pos, len);
        key_pos += sizeof(uint32) + len;
      } else {
        size_t len MY_ATTRIBUTE((unused)) =
            (*ptr)->make_sort_key(key_pos, *field_length);
        DBUG_ASSERT(len == *field_length);
        key_pos += *field_length;
      }
    }

    if (!hash.insert(std::string(key_buffer.get(), key_pos)).second) {
      // Duplicate record found; remove the row.
      if ((error = file->ha_delete_row(record))) goto err;
    }
  }

  (void)file->ha_rnd_end();
  return false;

err:
  if (file->inited) (void)file->ha_rnd_end();
  if (error) file->print_error(error, MYF(0));
  return true;
}
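
/*
  Key layout produced by the loop above, illustrated for an (INT, VARCHAR)
  field pair (a sketch):

    | int sort key (fixed) | uint32 length | varchar sort key (varlen) |

  Two rows are duplicates exactly when their concatenated keys compare
  equal byte for byte, which is what the std::string-keyed hash set tests.
*/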

bool cp_buffer_from_ref(THD *thd, TABLE *table, TABLE_REF *ref) {
  enum enum_check_fields save_check_for_truncated_fields =
      thd->check_for_truncated_fields;
  thd->check_for_truncated_fields = CHECK_FIELD_IGNORE;
  my_bitmap_map *old_map = dbug_tmp_use_all_columns(table, table->write_set);
  bool result = false;

  for (uint part_no = 0; part_no < ref->key_parts; part_no++) {
    store_key *s_key = ref->key_copy[part_no];
    if (!s_key) continue;

    /*
      copy() can return STORE_KEY_OK even when there are errors, so we
      need to check thd->is_error().
      @todo This is due to missing handling of error return value from
      Field::store().
    */
    if (s_key->copy() != store_key::STORE_KEY_OK || thd->is_error()) {
      result = true;
      break;
    }
  }
  thd->check_for_truncated_fields = save_check_for_truncated_fields;
  dbug_tmp_restore_column_map(table->write_set, old_map);
  return result;
}
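
/*
  Illustration (hypothetical plan): for an eq_ref lookup into t2 with
  condition t2.pk = t1.a, ref->key_copy holds one store_key which, when
  cp_buffer_from_ref() runs, evaluates t1.a for the current row and stores
  the value into t2's key buffer, ready for the subsequent index lookup.
*/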

/**
  Allocate group fields or take prepared (cached) ones.

  @param main_join  join of current select
  @param curr_join  current join (join of current select or temporary copy
                    of it)

  @retval false  ok
  @retval true   failed
*/

bool make_group_fields(JOIN *main_join, JOIN *curr_join) {
  DBUG_TRACE;
  if (main_join->group_fields_cache.elements) {
    curr_join->group_fields = main_join->group_fields_cache;
    curr_join->streaming_aggregation = true;
  } else {
    if (alloc_group_fields(curr_join, curr_join->group_list)) return true;
    main_join->group_fields_cache = curr_join->group_fields;
  }
  return false;
}

/**
  Get a list of buffers for saving the last group.

  Groups are saved in reverse order to make the check loop easier.
*/

static bool alloc_group_fields(JOIN *join, ORDER *group) {
  if (group) {
    for (; group; group = group->next) {
      Cached_item *tmp = new_Cached_item(join->thd, *group->item);
      if (!tmp || join->group_fields.push_front(tmp)) return true;
    }
  }
  join->streaming_aggregation = true; /* Mark for do_select */
  return false;
}
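
/*
  Example of the reverse order (a sketch): for GROUP BY a, b, c the
  push_front() calls above leave join->group_fields as (c, b, a).
  update_item_cache_if_changed() below iterates that list while counting
  its index downwards, so it ends up reporting the position, in GROUP BY
  order, of the first expression that changed.
*/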

/**
  Test if a single-row cache of items changed, and update the cache.

  @details Test if a list of items that typically represents a result
  row has changed. If the value of some item changed, update the cached
  value for this item.

  @param list list of <item, cached_value> pairs stored as Cached_item.

  @return -1 if no item changed
  @return index of the first item that changed
*/

int update_item_cache_if_changed(List<Cached_item> &list) {
  DBUG_TRACE;
  List_iterator<Cached_item> li(list);
  int idx = -1, i;
  Cached_item *buff;

  for (i = (int)list.elements - 1; (buff = li++); i--) {
    if (buff->cmp()) idx = i;
  }
  DBUG_PRINT("info", ("idx: %d", idx));
  return idx;
}
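
/*
  Usage sketch (hypothetical GROUP BY year, month): with cached values
  (year=2019, month=5), a row (year=2019, month=6) makes the function
  return 1 (only 'month', the second group expression, changed), while
  a row (year=2020, month=6) would return 0; a repeated call on the same
  row then returns -1, because the cache was refreshed by the first call.
*/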

/**
  Sets up caches for holding the values of non-aggregated expressions. The
  values are saved at the start of every new group.

  This code path is used in the cases when aggregation can be performed
  without a temporary table. Why it still uses a Temp_table_param is a
  mystery.

  Only FIELD_ITEMs and FUNC_ITEMs need to be saved between groups.
  Change the old item_field to use a new field that points at the saved
  field value. This function is only called before use of
  send_result_set_metadata.

  @param all_fields            all fields list; should really be const,
                               but Item does not always respect
                               constness
  @param num_select_elements   number of elements in select item list
  @param thd                   THD pointer
  @param [in,out] param        temporary table parameters
  @param [out] ref_item_array  array of pointers to top elements of field
                               list
  @param [out] res_selected_fields  new list of items of select item list
  @param [out] res_all_fields  new list of all items

  @todo
    In most cases this result will be sent to the user.
    This should be changed to use copy_int or copy_real depending
    on how the value is to be used: In some cases this may be an
    argument in a group function, like: IF(ISNULL(col),0,COUNT(*))

  @returns false if success, true if error
*/

bool setup_copy_fields(List<Item> &all_fields, size_t num_select_elements,
                       THD *thd, Temp_table_param *param,
                       Ref_item_array ref_item_array,
                       List<Item> *res_selected_fields,
                       List<Item> *res_all_fields) {
  DBUG_TRACE;

  res_selected_fields->empty();
  res_all_fields->empty();
  size_t border = all_fields.size() - num_select_elements;
  Memroot_vector<Item_copy *> extra_funcs(
      Memroot_allocator<Item_copy *>(thd->mem_root));

  param->grouped_expressions.clear();
  DBUG_ASSERT(param->copy_fields.empty());

  try {
    param->grouped_expressions.reserve(all_fields.size());
    param->copy_fields.reserve(param->field_count);
    extra_funcs.reserve(border);
  } catch (std::bad_alloc &) {
    return true;
  }

  List_iterator_fast<Item> li(all_fields);
  Item *pos;
  for (size_t i = 0; (pos = li++); i++) {
    Item *real_pos = pos->real_item();
    if (real_pos->type() == Item::FIELD_ITEM) {
      Item_field *item = new Item_field(thd, ((Item_field *)real_pos));
      if (item == nullptr) return true;
      if (pos->type() == Item::REF_ITEM) {
        /* preserve the names of the ref when dereferencing */
        Item_ref *ref = (Item_ref *)pos;
        item->db_name = ref->db_name;
        item->table_name = ref->table_name;
        item->item_name = ref->item_name;
      }
      pos = item;
      if (item->field->flags & BLOB_FLAG) {
        Item_copy *item_copy = Item_copy::create(pos);
        if (item_copy == nullptr) return true;
        pos = item_copy;
        /*
          Item_copy_string::copy for a function can call
          Item_copy_string::val_int for a blob via Item_ref.
          But if Item_copy_string::copy for the blob wasn't called before,
          its value will be wrong, so let's insert Item_copy_string for
          blobs in the beginning of copy_funcs
          (to see the full test case, look at having.test, BUG #4358)
        */
        param->grouped_expressions.push_back(item_copy);
      } else {
        /*
          set up save buffer and change result_field to point at
          saved value
        */
        Field *field = item->field;
        item->result_field = field->new_field(thd->mem_root, field->table, 1);
        /*
          We need to allocate one extra byte for null handling.
        */
        uchar *tmp = new (*THR_MALLOC) uchar[field->pack_length() + 1];
        if (tmp == nullptr) return true;

        DBUG_ASSERT(param->field_count > param->copy_fields.size());
        param->copy_fields.emplace_back(tmp, item->result_field);
        item->result_field->move_field(param->copy_fields.back().to_ptr,
                                       param->copy_fields.back().to_null_ptr,
                                       1);

        /*
          We have created a new Item_field; its field points into the
          previous table; its result_field points into a memory area
          (REF_SLICE_ORDERED_GROUP_BY) which represents the pseudo-tmp-table
          from where aggregates' values can be read. So does 'field'. A
          Copy_field manages copying from 'field' to the memory area.
        */
        item->field = item->result_field;
        /*
          Even though the field doesn't point into field->table->record[0], we
          must still link it to 'table' through field->table because that's an
          existing way to access some type info (e.g. nullability from
          table->nullable).
        */
      }
    } else if (((real_pos->type() == Item::FUNC_ITEM ||
                 real_pos->type() == Item::SUBSELECT_ITEM ||
                 real_pos->type() == Item::CACHE_ITEM ||
                 real_pos->type() == Item::COND_ITEM) &&
                !real_pos->has_aggregation() &&
                !real_pos->has_rollup_expr())) {  // Save for send fields
      pos = real_pos;
      /* TODO:
        In most cases this result will be sent to the user.
        This should be changed to use copy_int or copy_real depending
        on how the value is to be used: In some cases this may be an
        argument in a group function, like: IF(ISNULL(col),0,COUNT(*))
      */
      Item_copy *item_copy = Item_copy::create(pos);
      if (item_copy == nullptr) return true;
      pos = item_copy;
      if (i < border)  // HAVING, ORDER and GROUP BY
        extra_funcs.push_back(item_copy);
      else
        param->grouped_expressions.push_back(item_copy);
    }
    res_all_fields->push_back(pos);
    ref_item_array[((i < border) ? all_fields.size() - i - 1 : i - border)] =
        pos;
  }

  List_iterator_fast<Item> itr(*res_all_fields);
  for (size_t i = 0; i < border; i++) itr++;
  itr.sublist(*res_selected_fields, num_select_elements);
  /*
    Put elements from HAVING, ORDER BY and GROUP BY last to ensure that any
    reference used in these will resolve to an item that is already calculated
  */
  param->grouped_expressions.insert(param->grouped_expressions.end(),
                                    extra_funcs.begin(), extra_funcs.end());
  return false;
}

/**
  Make a copy of all simple SELECT'ed fields.

  This is done at the start of a new group so that we can retrieve
  these later when the group changes. It is also used in materialization,
  to copy the values into the temporary table's fields.

  @param param  Represents the current temporary file being produced
  @param thd    The current thread

  @returns false if OK, true on error.
*/

bool copy_fields(Temp_table_param *param, const THD *thd) {
  DBUG_TRACE;

  DBUG_PRINT("enter", ("for param %p", param));
  for (Copy_field &ptr : param->copy_fields) ptr.invoke_do_copy(&ptr);

  if (thd->is_error()) return true;

  for (Item_copy *item : param->grouped_expressions) {
    if (item->copy(thd)) return true;
  }
  return false;
}

bool copy_fields_and_funcs(Temp_table_param *param, const THD *thd,
                           Copy_func_type type) {
  if (copy_fields(param, thd)) return true;
  if (param->items_to_copy != nullptr) {
    if (copy_funcs(param, thd, type)) return true;
  }
  return false;
}

/**
  Change all funcs and sum_funcs to fields in tmp table, and create
  new list of all items.

  @param all_fields            all fields list; should really be const,
                               but Item does not always respect
                               constness
  @param num_select_elements   number of elements in select item list
  @param thd                   THD pointer
  @param [out] ref_item_array  array of pointers to top elements of field
                               list
  @param [out] res_selected_fields  new list of items of select item list
  @param [out] res_all_fields  new list of all items

  @returns false if success, true if error
*/

bool change_to_use_tmp_fields(List<Item> &all_fields,
                              size_t num_select_elements, THD *thd,
                              Ref_item_array ref_item_array,
                              List<Item> *res_selected_fields,
                              List<Item> *res_all_fields) {
  DBUG_TRACE;

  res_selected_fields->empty();
  res_all_fields->empty();

  List_iterator_fast<Item> li(all_fields);
  size_t border = all_fields.size() - num_select_elements;
  Item *item;
  for (size_t i = 0; (item = li++); i++) {
    Item *item_field;
    Field *field;
    if (item->has_aggregation() && item->type() != Item::SUM_FUNC_ITEM)
      item_field = item;
    else if (item->type() == Item::FIELD_ITEM)
      item_field = item->get_tmp_table_item(thd);
    else if (item->type() == Item::FUNC_ITEM &&
             ((Item_func *)item)->functype() == Item_func::SUSERVAR_FUNC) {
      field = item->get_tmp_table_field();
      if (field != NULL) {
        /*
          Replace "@:=<expression>" with "@:=<tmp table column>". Otherwise,
          we would re-evaluate <expression>, and if expression were a
          subquery, this would access already-unlocked tables.
        */
        Item_func_set_user_var *suv =
            new Item_func_set_user_var(thd, (Item_func_set_user_var *)item);
        Item_field *new_field = new Item_field(field);
        if (!suv || !new_field) return true;  // Fatal error
        List<Item> list;
        list.push_back(new_field);
        suv->set_arguments(list, true);
        item_field = suv;
      } else
        item_field = item;
    } else if ((field = item->get_tmp_table_field())) {
      if (item->type() == Item::SUM_FUNC_ITEM && field->table->group) {
        item_field = down_cast<Item_sum *>(item)->result_item(field);
        DBUG_ASSERT(item_field != nullptr);
      } else {
        item_field = new (thd->mem_root) Item_field(field);
        if (item_field == nullptr) return true;
      }
      if (item->real_item()->type() != Item::FIELD_ITEM) field->orig_table = 0;
      item_field->item_name = item->item_name;
      if (item->type() == Item::REF_ITEM) {
        Item_field *ifield = (Item_field *)item_field;
        Item_ref *iref = (Item_ref *)item;
        ifield->table_name = iref->table_name;
        ifield->db_name = iref->db_name;
      }
#ifndef DBUG_OFF
      if (!item_field->item_name.is_set()) {
        char buff[256];
        String str(buff, sizeof(buff), &my_charset_bin);
        str.length(0);
        item->print(thd, &str, QT_ORDINARY);
        item_field->item_name.copy(str.ptr(), str.length());
      }
#endif
    } else
      item_field = item;

    res_all_fields->push_back(item_field);
    /*
      Cf. comment explaining the reordering going on below in
      similar section of change_refs_to_tmp_fields
    */
    ref_item_array[((i < border) ? all_fields.size() - i - 1 : i - border)] =
        item_field;
    item_field->set_orig_field(item->get_orig_field());
  }

  List_iterator_fast<Item> itr(*res_all_fields);
  for (size_t i = 0; i < border; i++) itr++;
  itr.sublist(*res_selected_fields, num_select_elements);
  return false;
}
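
/*
  Illustration (hypothetical query): when SELECT a, COUNT(*) FROM t GROUP BY a
  is materialized into a tmp table, 'a' is replaced by an Item_field over the
  tmp table's column and COUNT(*) by its result_item(), which reads the
  already-aggregated value back from the tmp table instead of re-counting.
*/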

/**
  Change all sum_func refs to fields to point at fields in tmp table.
  Change all funcs to be fields in tmp table.

  @param all_fields            all fields list; should really be const,
                               but Item does not always respect
                               constness
  @param num_select_elements   number of elements in select item list
  @param thd                   THD pointer
  @param [out] ref_item_array  array of pointers to top elements of field
                               list
  @param [out] res_selected_fields  new list of items of select item list
  @param [out] res_all_fields  new list of all items

  @returns false if success, true if error
*/

bool change_refs_to_tmp_fields(List<Item> &all_fields,
                               size_t num_select_elements, THD *thd,
                               Ref_item_array ref_item_array,
                               List<Item> *res_selected_fields,
                               List<Item> *res_all_fields) {
  DBUG_TRACE;
  res_selected_fields->empty();
  res_all_fields->empty();

  List_iterator_fast<Item> li(all_fields);
  size_t border = all_fields.size() - num_select_elements;
  Item *item;
  for (size_t i = 0; (item = li++); i++) {
    /*
      Below we create "new_item" using get_tmp_table_item
      based on all_fields[i] and assign them to res_all_fields[i].

      The new items are also put into ref_item_array, but in another order,
      cf the diagram below.

      Example of the population of ref_item_array, res_all_fields and
      res_selected_fields based on all_fields:

      res_all_fields             res_selected_fields
        |                          |
        V                          V
       +--+   +--+   +--+   +--+   +--+   +--+          +--+
       |0 |-->|  |-->|  |-->|3 |-->|4 |-->|  |--> .. -->|9 |
       +--+   +--+   +--+   +--+   +--+   +--+          +--+
                              |      |
        ,------------->--------\----/
        |                       |
      +-^-+---+---+---+---+---#-^-+---+---+---+
      |   |   |   |   |   |   #   |   |   |   |   ref_item_array
      +---+---+---+---+---+---#---+---+---+---+
        4   5   6   7   8   9   3   2   1   0    position in all_fields list
                                                 similar to res_all_fields pos
      all_fields.elements == 10      border == 4
      (visible) elements == 6

      i==0   ->   afe-0-1 == 9    i==4   ->   4-4 == 0
      i==1   ->   afe-1-1 == 8     :
      i==2   ->   afe-2-1 == 7
      i==3   ->   afe-3-1 == 6    i==9   ->   9-4 == 5
    */
    Item *new_item = item->get_tmp_table_item(thd);
    res_all_fields->push_back(new_item);
    ref_item_array[((i < border) ? all_fields.size() - i - 1 : i - border)] =
        new_item;
  }

  List_iterator_fast<Item> itr(*res_all_fields);
  for (size_t i = 0; i < border; i++) itr++;
  itr.sublist(*res_selected_fields, num_select_elements);

  return thd->is_fatal_error();
}

/**
  Clear all result fields. Non-aggregated fields are set to NULL,
  aggregated fields are set to their special "clear" value.

  Result fields can be fields from input tables, field values generated
  by sum functions and literal values.

  This is used when no rows are found during grouping: for the FROM clause,
  a result row of all NULL values will be output; then SELECT list
  expressions get evaluated. E.g. SUM() will be NULL (the special "clear"
  value) and thus SUM() IS NULL will be true.

  @note Setting field values for input tables is a destructive operation,
        since it overwrites the NULL value flags with 1 bits. Rows from
        const tables are never re-read, hence their NULL value flags must
        be saved by this function and later restored by
        JOIN::restore_fields(). This is generally not necessary for
        non-const tables, since field values are overwritten when new rows
        are read.

  @param[out] save_nullinfo Map of tables whose fields were set to NULL,
                            and for which NULL values must be restored.
                            Should be set to all zeroes on entry to function.

  @returns false if success, true if error
*/

bool JOIN::clear_fields(table_map *save_nullinfo) {
  // Set all column values from all input tables to NULL.
  for (uint tableno = 0; tableno < primary_tables; tableno++) {
    QEP_TAB *const tab = qep_tab + tableno;
    TABLE *const table = tab->table_ref->table;
    if (!table->has_null_row()) {
      *save_nullinfo |= tab->table_ref->map();
      if (table->const_table) table->save_null_flags();
      table->set_null_row();  // All fields are NULL
    }
  }
  if (copy_fields(&tmp_table_param, thd)) return true;

  if (sum_funcs) {
    Item_sum *func, **func_ptr = sum_funcs;
    while ((func = *(func_ptr++))) func->clear();
  }
  return false;
}
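
/*
  Example of the effect (a sketch): for
    SELECT b, SUM(a) FROM t WHERE FALSE;
  no row is ever read, yet one result row must be produced. clear_fields()
  null-rows every input table so that 'b' evaluates to NULL, and
  Item_sum::clear() resets SUM(a) to its "no rows" value, also NULL.
*/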

/**
  Restore all result fields for all tables specified in save_nullinfo.

  @param save_nullinfo Set of tables for which restore is necessary.

  @note Const tables must have their NULL value flags restored,
        @see JOIN::clear_fields().
*/
void JOIN::restore_fields(table_map save_nullinfo) {
  DBUG_ASSERT(save_nullinfo);

  for (uint tableno = 0; tableno < primary_tables; tableno++) {
    QEP_TAB *const tab = qep_tab + tableno;
    if (save_nullinfo & tab->table_ref->map()) {
      TABLE *const table = tab->table_ref->table;
      if (table->const_table) table->restore_null_flags();
      table->reset_null_row();
    }
  }
}

/****************************************************************************
  QEP_tmp_table implementation
****************************************************************************/

/**
  @brief Instantiate tmp table and start index scan if necessary
  @todo A tmp table would always be created, even for an empty result.
        Extend the executor to avoid tmp table creation when no rows were
        written into the tmp table.
  @return
    true  error
    false ok
*/

bool QEP_tmp_table::prepare_tmp_table() {
  DBUG_TRACE;
  Temp_table_param *const tmp_tbl = qep_tab->tmp_table_param;

  /*
    Window final tmp file optimization: we skip actually writing to the
    tmp file, so no need to physically create it.
  */
  if (tmp_tbl->m_window_short_circuit) return false;

  TABLE *table = qep_tab->table();
  JOIN *join = qep_tab->join();
  int rc = 0;

  if (!table->is_created()) {
    if (instantiate_tmp_table(join->thd, table)) return true;
    empty_record(table);
  }
  /* If it wasn't already, start index scan for grouping using table index. */
  if (!table->file->inited &&
      ((table->group && tmp_tbl->sum_func_count && table->s->keys) ||
       table->hash_field))
    rc = table->file->ha_index_init(0, 0);
  else {
    /* Start index scan in scanning mode */
    rc = table->file->ha_rnd_init(true);
  }
  if (rc) {
    table->file->print_error(rc, MYF(0));
    return true;
  }

  return false;
}

/**
  @brief Prepare table if necessary and call write_func to save record

  @param end_of_records The end_of_record signal to pass to the writer

  @return return one of enum_nested_loop_state.
*/

enum_nested_loop_state QEP_tmp_table::put_record(bool end_of_records) {
  // Lazy tmp table creation/initialization
  if (!qep_tab->table()->file->inited && prepare_tmp_table())
    return NESTED_LOOP_ERROR;
  enum_nested_loop_state rc =
      (*write_func)(qep_tab->join(), qep_tab, end_of_records);
  return rc;
}

/**
  @brief Finish rnd/index scan after accumulating records, switch ref_array,
         and send accumulated records further.
  @return return one of enum_nested_loop_state.
*/

enum_nested_loop_state QEP_tmp_table::end_send() {
  enum_nested_loop_state rc = NESTED_LOOP_OK;
  TABLE *table = qep_tab->table();
  JOIN *join = qep_tab->join();

  // All records were stored, send them further
  int tmp, new_errno = 0;

  if ((rc = put_record(true)) < NESTED_LOOP_OK) return rc;

  if ((tmp = table->file->ha_index_or_rnd_end())) {
    DBUG_PRINT("error", ("ha_index_or_rnd_end() failed"));
    new_errno = tmp;
  }
  if (new_errno) {
    table->file->print_error(new_errno, MYF(0));
    return NESTED_LOOP_ERROR;
  }
  table->reginfo.lock_type = TL_UNLOCK;

  if (join->m_windows.elements > 0)
    join->thd->get_stmt_da()->reset_current_row_for_condition();

  Temp_table_param *const tmp_tbl = qep_tab->tmp_table_param;

  /*
    Window final tmp file optimization:
    rows have already been sent from end_write, so just return.
  */
  if (tmp_tbl->m_window_short_circuit) return NESTED_LOOP_OK;

  Switch_ref_item_slice slice_switch(join, qep_tab->ref_item_slice);

  bool in_first_read = true;
  while (rc == NESTED_LOOP_OK) {
    if (in_first_read) {
      in_first_read = false;

      if (qep_tab->needs_duplicate_removal && qep_tab->remove_duplicates()) {
        rc = NESTED_LOOP_ERROR;
        break;
      }

      // The same temporary table can be used multiple times (with different
      // data, e.g. for a dependent subquery). To avoid leaks, we need to make
      // sure we clean up any existing streams here, as join_setup_iterator
      // assumes the memory is unused.
      qep_tab->iterator.reset();

      join_setup_iterator(qep_tab);
      if (qep_tab->iterator->Init()) {
        rc = NESTED_LOOP_ERROR;
        break;
      }
    }

    int error = qep_tab->iterator->Read();
    if (error > 0 || (join->thd->is_error()))  // Fatal error
      rc = NESTED_LOOP_ERROR;
    else if (error < 0)
      break;
    else if (join->thd->killed)  // Aborted by user
    {
      join->thd->send_kill_message();
      rc = NESTED_LOOP_KILLED;
    } else
      rc = evaluate_join_record(join, qep_tab);
  }

  // Finish the rnd scan after sending the records
  if (table->file->inited) table->file->ha_rnd_end();

  return rc;
}

/******************************************************************************
  Code for pfs_batch_update
******************************************************************************/

bool QEP_TAB::pfs_batch_update(JOIN *join) const {
  /*
    Use PFS batch mode unless
    1. tab is not an inner-most table, or
    2. a table has eq_ref or const access type, or
    3. this tab contains a subquery that accesses one or more tables
  */

  return !((join->qep_tab + join->primary_tables - 1) != this ||  // 1
           this->type() == JT_EQ_REF ||                           // 2
           this->type() == JT_CONST || this->type() == JT_SYSTEM ||
           (condition() && condition()->has_subquery()));  // 3
}
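
/*
  Rationale, illustrated (a sketch): for the inner-most table of a join
  that returns many rows per probe, entering PFS batch mode amortizes the
  per-row instrumentation cost across the whole batch. With eq_ref or
  const access at most one row comes back per probe, so the start/stop
  overhead of batch mode would not be recouped, hence the exclusions above.
*/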

/**
  @} (end of group Query_Executor)
*/

vector<string> UnqualifiedCountIterator::DebugString() const {
  return {"Count rows in " + string(m_join->qep_tab->table()->alias)};
}

int UnqualifiedCountIterator::Read() {
  if (!m_has_row) {
    return -1;
  }

  for (Item &item : m_join->all_fields) {
    if (item.type() == Item::SUM_FUNC_ITEM &&
        down_cast<Item_sum &>(item).sum_func() == Item_sum::COUNT_FUNC) {
      int error;
      ulonglong count = get_exact_record_count(m_join->qep_tab,
                                               m_join->primary_tables, &error);
      if (error) return 1;

      down_cast<Item_sum_count &>(item).make_const(
          static_cast<longlong>(count));
    }
  }

  // If we are outputting to a temporary table, we need to copy the results
  // into it here. It is also used for non-aggregated items, even when there
  // are no temporary tables involved.
  if (copy_fields_and_funcs(&m_join->tmp_table_param, m_join->thd)) {
    return 1;
  }

  m_has_row = false;
  return 0;
}
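
/*
  Illustration (a sketch): for a plain SELECT COUNT(*) FROM t with no
  filtering, this iterator obtains the exact row count from the storage
  engine via get_exact_record_count() and emits a single constant row,
  instead of scanning and counting t row by row.
*/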

int ZeroRowsAggregatedIterator::Read() {
  if (!m_has_row) {
    return -1;
  }

  // Mark tables as containing only NULL values
  for (TABLE_LIST *table = m_join->select_lex->leaf_tables; table;
       table = table->next_leaf) {
    table->table->set_null_row();
  }

  // Calculate aggregate functions for no rows

  /*
    Must notify all fields that there are no rows (not only those
    that will be returned) because join->having may refer to
    fields that are not part of the result columns.
  */
  for (Item &item : m_join->all_fields) {
    item.no_rows_in_result();
  }

  m_has_row = false;
  if (m_examined_rows != nullptr) {
    ++*m_examined_rows;
  }
  return 0;
}