diff options
Diffstat (limited to 'storage/perfschema/pfs.cc')
-rw-r--r-- | storage/perfschema/pfs.cc | 2053 |
1 files changed, 2053 insertions, 0 deletions
diff --git a/storage/perfschema/pfs.cc b/storage/perfschema/pfs.cc new file mode 100644 index 00000000000..01b4b3711c1 --- /dev/null +++ b/storage/perfschema/pfs.cc @@ -0,0 +1,2053 @@ +/* Copyright (C) 2008-2009 Sun Microsystems, Inc + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/** + @file storage/perfschema/pfs.cc + The performance schema implementation of all instruments. +*/ + +#include "my_global.h" +#include "pfs.h" +#include "pfs_instr_class.h" +#include "pfs_instr.h" +#include "pfs_global.h" +#include "pfs_column_values.h" +#include "pfs_timer.h" +#include "pfs_events_waits.h" + +/* Pending WL#4895 PERFORMANCE_SCHEMA Instrumenting Table IO */ +#undef HAVE_TABLE_WAIT + +/** + @page PAGE_PERFORMANCE_SCHEMA The Performance Schema main page + MySQL PERFORMANCE_SCHEMA implementation. + + @section INTRO Introduction + The PERFORMANCE_SCHEMA is a way to introspect the internal execution of + the server at runtime. + The performance schema focuses primarily on performance data, + as opposed to the INFORMATION_SCHEMA whose purpose is to inspect metadata. + + From a user point of view, the performance schema consists of: + - a dedicated database schema, named PERFORMANCE_SCHEMA, + - SQL tables, used to query the server internal state or change + configuration settings. + + From an implementation point of view, the performance schema is a dedicated + Storage Engine which exposes data collected by 'Instrumentation Points' + placed in the server code. + + @section INTERFACES Multiple interfaces + + The performance schema exposes many different interfaces, + for different components, and for different purposes. + + @subsection INT_INSTRUMENTING Instrumenting interface + + All the data representing the server internal state exposed + in the performance schema must be first collected: + this is the role of the instrumenting interface. + The instrumenting interface is a coding interface provided + by implementors (of the performance schema) to implementors + (of the server or server components). + + This interface is available to: + - C implementations + - C++ implementations + - the core SQL layer (/sql) + - the mysys library (/mysys) + - MySQL plugins, including storage engines, + - third party plugins, including third party storage engines. + + For details, see the @ref PAGE_INSTRUMENTATION_INTERFACE + "instrumentation interface page". + + @subsection INT_COMPILING Compiling interface + + The implementation of the performance schema can be enabled or disabled at + build time, when building MySQL from the source code. + + When building with the performance schema code, some compilation flags + are available to change the default values used in the code, if required. + + For more details, see: + @verbatim ./configure --help @endverbatim + + To compile with the performance schema: + @verbatim ./configure --with-perfschema @endverbatim + + The implementation of all the compiling options is located in + @verbatim ./storage/perfschema/plug.in @endverbatim + + @subsection INT_STARTUP Server startup interface + + The server startup interface consists of the "./mysqld ..." + command line used to start the server. + When the performance schema is compiled in the server binary, + extra command line options are available. + + These extra start options allow the DBA to: + - enable or disable the performance schema + - specify some sizing parameters. + + To see help for the performance schema startup options, see: + @verbatim ./sql/mysqld --verbose --help @endverbatim + + The implementation of all the startup options is located in + @verbatim ./sql/mysqld.cc, my_long_options[] @endverbatim + + @subsection INT_BOOTSTRAP Server bootstrap interface + + The bootstrap interface is a private interface exposed by + the performance schema, and used by the SQL layer. + Its role is to advertise all the SQL tables natively + supported by the performance schema to the SQL server. + The code consists of creating MySQL tables for the + performance schema itself, and is used in './mysql --bootstrap' + mode when a server is installed. + + The implementation of the database creation script is located in + @verbatim ./scripts/mysql_system_tables.sql @endverbatim + + @subsection INT_CONFIG Runtime configuration interface + + When the performance schema is used at runtime, various configuration + parameters can be used to specify what kind of data is collected, + what kind of aggregations are computed, what kind of timers are used, + what events are timed, etc. + + For all these capabilities, not a single statement or special syntax + was introduced in the parser. + Instead of new SQL statements, the interface consists of DML + (SELECT, INSERT, UPDATE, DELETE) against special "SETUP" tables. + + For example: + @verbatim mysql> update performance_schema.SETUP_INSTRUMENTS + set ENABLED='YES', TIMED='YES'; + Query OK, 234 rows affected (0.00 sec) + Rows matched: 234 Changed: 234 Warnings: 0 @endverbatim + + @subsection INT_STATUS Internal audit interface + + The internal audit interface is provided to the DBA to inspect if the + performance schema code itself is functioning properly. + This interface is necessary because a failure caused while + instrumenting code in the server should not cause failures in the + MySQL server itself, so that the performance schema implementation + never raises errors during runtime execution. + + This auditing interface consists of: + @verbatim SHOW ENGINE PERFORMANCE_SCHEMA STATUS; @endverbatim + It displays data related to the memory usage of the performance schema, + as well as statistics about lost events, if any. + + The SHOW STATUS command is implemented in + @verbatim ./storage/perfschema/pfs_engine_table.cc @endverbatim + + @subsection INT_QUERY Query interface + + The query interface is used to query the internal state of a running server. + It is provided as SQL tables. + + For example: + @verbatim mysql> select * from performance_schema.EVENTS_WAITS_CURRENT; + @endverbatim + + @section DESIGN_PRINCIPLES Design principles + + @subsection PRINCIPLE_BEHAVIOR No behavior changes + + The primary goal of the performance schema is to measure (instrument) the + execution of the server. A good measure should not cause any change + in behavior. + + To achieve this, the overall design of the performance schema complies + with the following very severe design constraints: + + The parser is unchanged. There are no new keywords, no new statements. + This guarantees that existing applications will run the same way with or + without the performance schema. + + All the instrumentation points return "void", there are no error codes. + Even if the performance schema internally fails, execution of the server + code will proceed. + + None of the instrumentation points allocate memory. + All the memory used by the performance schema is pre-allocated at startup, + and is considered "static" during the server life time. + + None of the instrumentation points use any pthread_mutex, pthread_rwlock, + or pthread_cond (or platform equivalents). + Executing the instrumentation point should not cause thread scheduling to + change in the server. + + In other words, the implementation of the instrumentation points, + including all the code called by the instrumentation points, is: + - malloc free + - mutex free + - rwlock free + + TODO: All the code located in storage/perfschema is malloc free, + but unfortunately the usage of LF_HASH introduces some memory allocation. + This should be revised if possible, to use a lock-free, + malloc-free hash code table. + + @subsection PRINCIPLE_PERFORMANCE No performance hit + + The instrumentation of the server should be as fast as possible. + In cases when there are choices between: + - doing some processing when recording the performance data + in the instrumentation, + - doing some processing when retrieving the performance data, + + priority is given in the design to make the instrumentation faster, + pushing some complexity to data retrieval. + + As a result, some parts of the design, related to: + - the setup code path, + - the query code path, + + might appear to be sub-optimal. + + The criterion used here is to optimize primarily the critical path (data + collection), possibly at the expense of non-critical code paths. + + @subsection PRINCIPLE_NOT_INTRUSIVE Unintrusive instrumentation + + For the performance schema in general to be successful, the barrier + of entry for a developer should be low, so it's easy to instrument code. + + In particular, the instrumentation interface: + - is available for C and C++ code (so it's a C interface), + - does not require parameters that the calling code can't easily provide, + - supports partial instrumentation (for example, instrumenting mutexes does + not require that every mutex is instrumented) + + @subsection PRINCIPLE_EXTENDABLE Extendable instrumentation + + As the content of the performance schema improves, + with more tables exposed and more data collected, + the instrumentation interface will also be augmented + to support instrumenting new concepts. + Existing instrumentations should not be affected when additional + instrumentation is made available, and making a new instrumentation + available should not require existing instrumented code to support it. + + @subsection PRINCIPLE_VERSIONED Versioned instrumentation + + Given that the instrumentation offered by the performance schema will + be augmented with time, when more features are implemented, + the interface itself should be versioned, to keep compatibility + with previous instrumented code. + + For example, after both plugin-A and plugin-B have been instrumented for + mutexes, read write locks and conditions, using the instrumentation + interface, we can anticipate that the instrumentation interface + is expanded to support file based operations. + + Plugin-A, a file based storage engine, will most likely use the expanded + interface and instrument its file usage, using the version 2 + interface, while Plugin-B, a network based storage engine, will not change + its code and not release a new binary. + + When later the instrumentation interface is expanded to support network + based operations (which will define interface version 3), the Plugin-B code + can then be changed to make use of it. + + Note, this is just an example to illustrate the design concept here. + Both mutexes and file instrumentation are already available + since version 1 of the instrumentation interface. + + @subsection PRINCIPLE_DEPLOYMENT Easy deployment + + Internally, we might want every plugin implementation to upgrade the + instrumented code to the latest available, but this will cause additional + work and this is not practical if the code change is monolithic. + + Externally, for third party plugin implementors, asking implementors to + always stay aligned to the latest instrumentation and make new releases, + even when the change does not provide new functionality for them, + is a bad idea. + + For example, requiring a network based engine to re-release because the + instrumentation interface changed for file based operations, will create + too many deployment issues. + + So, the performance schema implementation must support concurrently, + in the same deployment, multiple versions of the instrumentation + interface, and ensure binary compatibility with each version. + + In addition to this, the performance schema can be included or excluded + from the server binary, using build time configuration options. + + Regardless, the following types of deployment are valid: + - a server supporting the performance schema + a storage engine + that is not instrumented + - a server not supporting the performance schema + a storage engine + that is instrumented +*/ + +/** + @page PAGE_INSTRUMENTATION_INTERFACE + Performance schema: instrumentation interface page. + MySQL performance schema instrumentation interface. + + @section INTRO Introduction + + The instrumentation interface consist of two layers: + - a raw ABI (Application Binary Interface) layer, that exposes the primitive + instrumentation functions exported by the performance schema instrumentation + - an API (Application Programing Interface) layer, + that provides many helpers for a developer instrumenting some code, + to make the instrumentation as easy as possible. + + The ABI layer consists of: +@code +#include "mysql/psi/psi.h" +@endcode + + The API layer consists of: +@code +#include "mysql/psi/mutex_mutex.h" +#include "mysql/psi/mutex_file.h" +@endcode + + The first helper is for mutexes, rwlocks and conditions, + the second for file io. + + The API layer exposes C macros and typedefs which will expand: + - either to non-instrumented code, when compiled without the performance + schema instrumentation + - or to instrumented code, that will issue the raw calls to the ABI layer + so that the implementation can collect data. + + Note that all the names introduced (for example, @c mysql_mutex_lock) do not + collide with any other namespace. + In particular, the macro @c mysql_mutex_lock is on purpose not named + @c pthread_mutex_lock. + This is to: + - avoid overloading @c pthread_mutex_lock with yet another macro, + which is dangerous as it can affect user code and pollute + the end-user namespace. + - allow the developer instrumenting code to selectively instrument + some code but not all. + + @section PRINCIPLES Design principles + + The ABI part is designed as a facade, that exposes basic primitives. + The expectation is that each primitive will be very stable over time, + but the list will constantly grow when more instruments are supported. + To support binary compatibility with plugins compiled with a different + version of the instrumentation, the ABI itself is versioned + (see @c PSI_v1, @c PSI_v2). + + For a given instrumentation point in the API, the basic coding pattern + used is: + - (a) If the performance schema is not initialized, do nothing + - (b) If the object acted upon is not instrumented, do nothing + - (c) otherwise, notify the performance schema of the operation + about to be performed. + + The implementation of the instrumentation interface can: + - decide that it is not interested by the event, and return NULL. + In this context, 'interested' means whether the instrumentation for + this object + event is turned on in the performance schema configuration + (the SETUP_ tables). + - decide that this event is to be instrumented. + In this case, the instrumentation returns an opaque pointer, + that acts as a listener. + + If a listener is returned, the instrumentation point then: + - (d) invokes the "start" event method + - (e) executes the instrumented code. + - (f) invokes the "end" event method. + + If no listener is returned, only the instrumented code (e) is invoked. + + The following code fragment is annotated to show how in detail this pattern + in implemented, when the instrumentation is compiled in: + +@verbatim +static inline int mysql_mutex_lock( + mysql_mutex_t *that, myf flags, const char *src_file, uint src_line) +{ + int result; + struct PSI_mutex_locker *locker= NULL; + + ...... (a) .......... (b) + if (PSI_server && that->m_psi) + + .......................... (c) + if ((locker= PSI_server->get_thread_mutex_locker(that->m_psi, + PSI_MUTEX_LOCK))) + + ............... (d) + PSI_server->start_mutex_wait(locker, src_file, src_line); + + ........ (e) + result= pthread_mutex_lock(&that->m_mutex); + + if (locker) + + ............. (f) + PSI_server->end_mutex_wait(locker, result); + + return result; +} +@endverbatim + + When the performance schema instrumentation is not compiled in, + the code becomes simply a wrapper, expanded in line by the compiler: + +@verbatim +static inline int mysql_mutex_lock(...) +{ + int result; + + ........ (e) + result= pthread_mutex_lock(&that->m_mutex); + + return result; +} +@endverbatim +*/ + +/** + @page PAGE_AGGREGATES Performance schema: the aggregates page. + Performance schema aggregates. + + @section INTRO Introduction + + Aggregates tables are tables that can be formally defined as + SELECT ... from EVENTS_WAITS_HISTORY_INFINITE ... group by 'group clause'. + + Each group clause defines a different kind of aggregate, and corresponds to + a different table exposed by the performance schema. + + Aggregates can be either: + - computed on the fly, + - computed on demand, based on other available data. + + 'EVENTS_WAITS_HISTORY_INFINITE' is a table that does not exist, + the best approximation is EVENTS_WAITS_HISTORY_LONG. + Aggregates computed on the fly in fact are based on EVENTS_WAITS_CURRENT, + while aggregates computed on demand are based on other + EVENTS_WAITS_SUMMARY_BY_xxx tables. + + To better understand the implementation itself, a bit of math is + required first, to understand the model behind the code: + the code is deceptively simple, the real complexity resides + in the flyweight of pointers between various performance schema buffers. + + @section DIMENSION Concept of dimension + + An event measured by the instrumentation has many attributes. + An event is represented as a data point P(x1, x2, ..., xN), + where each x_i coordinate represents a given attribute value. + + Examples of attributes are: + - the time waited + - the object waited on + - the instrument waited on + - the thread that waited + - the operation performed + - per object or per operation additional attributes, such as spins, + number of bytes, etc. + + Computing an aggregate per thread is fundamentally different from + computing an aggregate by instrument, so the "_BY_THREAD" and + "_BY_EVENT_NAME" aggregates are different dimensions, + operating on different x_i and x_j coordinates. + These aggregates are "orthogonal". + + @section PROJECTION Concept of projection + + A given x_i attribute value can convey either just one basic information, + such as a number of bytes, or can convey implied information, + such as an object fully qualified name. + + For example, from the value "test.t1", the name of the object schema + "test" can be separated from the object name "t1", so that now aggregates + by object schema can be implemented. + + In math terms, that corresponds to defining a function: + F_i (x): x --> y + Applying this function to our point P gives another point P': + + F_i (P): + P(x1, x2, ..., x{i-1}, x_i, x{i+1}, ..., x_N + --> P' (x1, x2, ..., x{i-1}, f_i(x_i), x{i+1}, ..., x_N) + + That function defines in fact an aggregate ! + In SQL terms, this aggregate would look like the following table: + +@verbatim + CREATE VIEW EVENTS_WAITS_SUMMARY_BY_Func_i AS + SELECT col_1, col_2, ..., col_{i-1}, + Func_i(col_i), + COUNT(col_i), + MIN(col_i), AVG(col_i), MAX(col_i), -- if col_i is a numeric value + col_{i+1}, ..., col_N + FROM EVENTS_WAITS_HISTORY_INFINITE + group by col_1, col_2, ..., col_{i-1}, col{i+1}, ..., col_N. +@endverbatim + + Note that not all columns have to be included, + in particular some columns that are dependent on the x_i column should + be removed, so that in practice, MySQL's aggregation method tends to + remove many attributes at each aggregation steps. + + For example, when aggregating wait events by object instances, + - the wait_time and number_of_bytes can be summed, + and sum(wait_time) now becomes an object instance attribute. + - the source, timer_start, timer_end columns are not in the + _BY_INSTANCE table, because these attributes are only + meaningful for a wait. + + @section COMPOSITION Concept of composition + + Now, the "test.t1" --> "test" example was purely theory, + just to explain the concept, and does not lead very far. + Let's look at a more interesting example of data that can be derived + from the row event. + + An event creates a transient object, PFS_wait_locker, per operation. + This object's life cycle is extremely short: it's created just + before the start_wait() instrumentation call, and is destroyed in + the end_wait() call. + + The wait locker itself contains a pointer to the object instance + waited on. + That allows to implement a wait_locker --> object instance projection, + with m_target. + The object instance life cycle depends on _init and _destroy calls + from the code, such as mysql_mutex_init() + and mysql_mutex_destroy() for a mutex. + + The object instance waited on contains a pointer to the object class, + which is represented by the instrument name. + That allows to implement an object instance --> object class projection. + The object class life cycle is permanent, as instruments are loaded in + the server and never removed. + + The object class is named in such a way + (for example, "wait/sync/mutex/sql/LOCK_open", + "wait/io/file/maria/data_file) that the component ("sql", "maria") + that it belongs to can be inferred. + That allows to implement an object class --> server component projection. + + Back to math again, we have, for example for mutexes: + + F1 (l) : PFS_wait_locker l --> PFS_mutex m = l->m_target.m_mutex + + F1_to_2 (m) : PFS_mutex m --> PFS_mutex_class i = m->m_class + + F2_to_3 (i) : PFS_mutex_class i --> const char *component = + substring(i->m_name, ...) + + Per components aggregates are not implemented, this is just an illustration. + + F1 alone defines this aggregate: + + EVENTS_WAITS_HISTORY_INFINITE --> EVENTS_WAITS_SUMMARY_BY_INSTANCE + (or MUTEX_INSTANCE) + + F1_to_2 alone could define this aggregate: + + EVENTS_WAITS_SUMMARY_BY_INSTANCE --> EVENTS_WAITS_SUMMARY_BY_EVENT_NAME + + Alternatively, using function composition, with + F2 = F1_to_2 o F1, F2 defines: + + EVENTS_WAITS_HISTORY_INFINITE --> EVENTS_WAITS_SUMMARY_BY_EVENT_NAME + + Likewise, F_2_to_3 defines: + + EVENTS_WAITS_SUMMARY_BY_EVENT_NAME --> EVENTS_WAITS_SUMMARY_BY_COMPONENT + + and F3 = F_2_to_3 o F_1_to_2 o F1 defines: + + EVENTS_WAITS_HISTORY_INFINITE --> EVENTS_WAITS_SUMMARY_BY_COMPONENT + + What has all this to do with the code ? + + Function composition such as F_2_to_3 o F_1_to_2 o F1 is implemented + as PFS_single_stat_chain, where each link in the chain represents + an individual F_{i}_to_{i+1} aggregation step. + + A single call to aggregate_single_stat_chain() updates all the tables + described in the statistics chain. + + @section STAT_CHAIN Statistics chains + + Statistics chains are only used for on the fly aggregates, + and are therefore all based initially on the '_CURRENT' base table that + contains the data recorded. + The following table aggregates are implemented with a statistics chain: + + EVENTS_WAITS_CURRENT --> EVENTS_WAITS_SUMMARY_BY_INSTANCE + --> EVENTS_WAITS_SUMMARY_BY_EVENT_NAME + + This relationship is between classes. + + In terms of object instances, or records, this chain is implemented + as a flyweight. + + For example, assuming the following scenario: + - A mutex class "M" is instrumented, the instrument name + is "wait/sync/mutex/sql/M" + - This mutex instrument has been instantiated twice, + mutex instances are noted M-1 and M-2 + - Threads T-A and T-B are locking mutex instance M-1 + - Threads T-C and T-D are locking mutex instance M-2 + + The performance schema will record the following data: + - EVENTS_WAITS_CURRENT has 4 rows, one for each mutex locker + - EVENTS_WAITS_SUMMARY_BY_INSTANCE shows 2 rows, for M-1 and M-2 + - EVENTS_WAITS_SUMMARY_BY_EVENT_NAME shows 1 row, for M + + The graph of structures will look like: + +@verbatim + PFS_wait_locker (T-A, M-1) ---------- + | + v + PFS_mutex (M-1) + - m_wait_stat ------------ + ^ | + | | + PFS_wait_locker (T-B, M-1) ---------- | + v + PFS_mutex_class (M) + - m_wait_stat + PFS_wait_locker (T-C, M-2) ---------- ^ + | | + v | + PFS_mutex (M-2) | + - m_wait_stat ------------ + ^ + | + PFS_wait_locker (T-D, M-2) ---------- + + || || || + || || || + vv vv vv + + EVENTS_WAITS_CURRENT ..._SUMMARY_BY_INSTANCE ..._SUMMARY_BY_EVENT_NAME +@endverbatim + + @section ON_THE_FLY On the fly aggregates + + 'On the fly' aggregates are computed during the code execution. + This is necessary because the data the aggregate is based on is volatile, + and can not be kept indefinitely. + + @section HIGHER_LEVEL Higher level aggregates + + Note: no higher level aggregate is implemented yet, + this section is a place holder. +*/ + +/** + @defgroup Performance_schema Performance Schema + The performance schema component. + For details, see the + @ref PAGE_PERFORMANCE_SCHEMA "performance schema main page". + + @defgroup Performance_schema_implementation Performance Schema Implementation + @ingroup Performance_schema + + @defgroup Performance_schema_tables Performance Schema Tables + @ingroup Performance_schema_implementation +*/ + +pthread_key(PFS_thread*, THR_PFS); +bool THR_PFS_initialized= false; + +static enum_operation_type mutex_operation_map[]= +{ + OPERATION_TYPE_LOCK, + OPERATION_TYPE_TRYLOCK +}; + +static enum_operation_type rwlock_operation_map[]= +{ + OPERATION_TYPE_READLOCK, + OPERATION_TYPE_WRITELOCK, + OPERATION_TYPE_TRYREADLOCK, + OPERATION_TYPE_TRYWRITELOCK +}; + +static enum_operation_type cond_operation_map[]= +{ + OPERATION_TYPE_WAIT, + OPERATION_TYPE_TIMEDWAIT +}; + +/** + Conversion map from PSI_file_operation to enum_operation_type. + Indexed by enum PSI_file_operation. +*/ +static enum_operation_type file_operation_map[]= +{ + OPERATION_TYPE_FILECREATE, + OPERATION_TYPE_FILECREATETMP, + OPERATION_TYPE_FILEOPEN, + OPERATION_TYPE_FILESTREAMOPEN, + OPERATION_TYPE_FILECLOSE, + OPERATION_TYPE_FILESTREAMCLOSE, + OPERATION_TYPE_FILEREAD, + OPERATION_TYPE_FILEWRITE, + OPERATION_TYPE_FILESEEK, + OPERATION_TYPE_FILETELL, + OPERATION_TYPE_FILEFLUSH, + OPERATION_TYPE_FILESTAT, + OPERATION_TYPE_FILEFSTAT, + OPERATION_TYPE_FILECHSIZE, + OPERATION_TYPE_FILEDELETE, + OPERATION_TYPE_FILERENAME, + OPERATION_TYPE_FILESYNC +}; + +/** + Build the prefix name of a class of instruments in a category. + For example, this function builds the string 'wait/sync/mutex/sql/' from + a prefix 'wait/sync/mutex' and a category 'sql'. + This prefix is used later to build each instrument name, such as + 'wait/sync/mutex/sql/LOCK_open'. + @param prefix Prefix for this class of instruments + @param category Category name + @param [out] output Buffer of length PFS_MAX_INFO_NAME_LENGTH. + @param [out] output_length Length of the resulting output string. + @return 0 for success, non zero for errors +*/ +static int build_prefix(const LEX_STRING *prefix, const char *category, + char *output, int *output_length) +{ + int len= strlen(category); + char *out_ptr= output; + int prefix_length= prefix->length; + + if (unlikely((prefix_length + len + 1) >= + PFS_MAX_FULL_PREFIX_NAME_LENGTH)) + { + pfs_print_error("build_prefix: prefix+category is too long <%s> <%s>\n", + prefix->str, category); + return 1; + } + + if (unlikely(strchr(category, '/') != NULL)) + { + pfs_print_error("build_prefix: invalid category <%s>\n", + category); + return 1; + } + + /* output = prefix + category + '/' */ + memcpy(out_ptr, prefix->str, prefix_length); + out_ptr+= prefix_length; + memcpy(out_ptr, category, len); + out_ptr+= len; + *out_ptr= '/'; + out_ptr++; + *output_length= out_ptr - output; + + return 0; +} + +#define REGISTER_BODY_V1(KEY_T, PREFIX, REGISTER_FUNC) \ + KEY_T key; \ + char formatted_name[PFS_MAX_INFO_NAME_LENGTH]; \ + int prefix_length; \ + int len; \ + int full_length; \ + \ + DBUG_ASSERT(category != NULL); \ + DBUG_ASSERT(info != NULL); \ + if (unlikely(build_prefix(&PREFIX, category, \ + formatted_name, &prefix_length))) \ + { \ + for (; count>0; count--, info++) \ + *(info->m_key)= 0; \ + return ; \ + } \ + \ + for (; count>0; count--, info++) \ + { \ + DBUG_ASSERT(info->m_key != NULL); \ + DBUG_ASSERT(info->m_name != NULL); \ + len= strlen(info->m_name); \ + full_length= prefix_length + len; \ + if (likely(full_length <= PFS_MAX_INFO_NAME_LENGTH)) \ + { \ + memcpy(formatted_name + prefix_length, info->m_name, len); \ + key= REGISTER_FUNC(formatted_name, full_length, info->m_flags); \ + } \ + else \ + { \ + pfs_print_error("REGISTER_BODY_V1: name too long <%s> <%s>\n", \ + category, info->m_name); \ + key= 0; \ + } \ + \ + *(info->m_key)= key; \ + } \ + return; + +static void register_mutex_v1(const char *category, + PSI_mutex_info_v1 *info, + int count) +{ + REGISTER_BODY_V1(PSI_mutex_key, + mutex_instrument_prefix, + register_mutex_class) +} + +static void register_rwlock_v1(const char *category, + PSI_rwlock_info_v1 *info, + int count) +{ + REGISTER_BODY_V1(PSI_rwlock_key, + rwlock_instrument_prefix, + register_rwlock_class) +} + +static void register_cond_v1(const char *category, + PSI_cond_info_v1 *info, + int count) +{ + REGISTER_BODY_V1(PSI_cond_key, + cond_instrument_prefix, + register_cond_class) +} + +static void register_thread_v1(const char *category, + PSI_thread_info_v1 *info, + int count) +{ + REGISTER_BODY_V1(PSI_thread_key, + thread_instrument_prefix, + register_thread_class) +} + +static void register_file_v1(const char *category, + PSI_file_info_v1 *info, + int count) +{ + REGISTER_BODY_V1(PSI_file_key, + file_instrument_prefix, + register_file_class) +} + +#define INIT_BODY_V1(T, KEY, ID) \ + PFS_##T##_class *klass; \ + PFS_##T *pfs; \ + PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); \ + if (unlikely(pfs_thread == NULL)) \ + return NULL; \ + if (! pfs_thread->m_enabled) \ + return NULL; \ + klass= find_##T##_class(KEY); \ + if (unlikely(klass == NULL)) \ + return NULL; \ + if (! klass->m_enabled) \ + return NULL; \ + pfs= create_##T(klass, ID); \ + return reinterpret_cast<PSI_##T *> (pfs) + +static PSI_mutex* +init_mutex_v1(PSI_mutex_key key, const void *identity) +{ + INIT_BODY_V1(mutex, key, identity); +} + +static void destroy_mutex_v1(PSI_mutex* mutex) +{ + PFS_mutex *pfs= reinterpret_cast<PFS_mutex*> (mutex); + destroy_mutex(pfs); +} + +static PSI_rwlock* +init_rwlock_v1(PSI_rwlock_key key, const void *identity) +{ + INIT_BODY_V1(rwlock, key, identity); +} + +static void destroy_rwlock_v1(PSI_rwlock* rwlock) +{ + PFS_rwlock *pfs= reinterpret_cast<PFS_rwlock*> (rwlock); + destroy_rwlock(pfs); +} + +static PSI_cond* +init_cond_v1(PSI_cond_key key, const void *identity) +{ + INIT_BODY_V1(cond, key, identity); +} + +static void destroy_cond_v1(PSI_cond* cond) +{ + PFS_cond *pfs= reinterpret_cast<PFS_cond*> (cond); + destroy_cond(pfs); +} + +static PSI_table_share* +get_table_share_v1(const char *schema_name, int schema_name_length, + const char *table_name, int table_name_length, + const void *identity) +{ +#ifdef HAVE_TABLE_WAIT + PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); + if (unlikely(pfs_thread == NULL)) + return NULL; + PFS_table_share* share; + share= find_or_create_table_share(pfs_thread, + schema_name, schema_name_length, + table_name, table_name_length); + return reinterpret_cast<PSI_table_share*> (share); +#else + return NULL; +#endif +} + +static void release_table_share_v1(PSI_table_share* share) +{ + /* + To be implemented by WL#4895 PERFORMANCE_SCHEMA Instrumenting Table IO. + */ +} + +static PSI_table* +open_table_v1(PSI_table_share *share, const void *identity) +{ + PFS_table_share *pfs_table_share= + reinterpret_cast<PFS_table_share*> (share); + PFS_table *pfs_table; + DBUG_ASSERT(pfs_table_share); + pfs_table= create_table(pfs_table_share, identity); + return reinterpret_cast<PSI_table *> (pfs_table); +} + +static void close_table_v1(PSI_table *table) +{ + PFS_table *pfs= reinterpret_cast<PFS_table*> (table); + DBUG_ASSERT(pfs); + destroy_table(pfs); +} + +static void create_file_v1(PSI_file_key key, const char *name, File file) +{ + int index= (int) file; + if (unlikely(index < 0)) + return; + PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); + if (unlikely(pfs_thread == NULL)) + return; + if (! pfs_thread->m_enabled) + return; + PFS_file_class *klass= find_file_class(key); + if (unlikely(klass == NULL)) + return; + if (! klass->m_enabled) + return; + if (likely(index < file_handle_max)) + { + uint len= strlen(name); + PFS_file *pfs= find_or_create_file(pfs_thread, klass, name, len); + file_handle_array[index]= pfs; + } + else + file_handle_lost++; +} + +struct PFS_spawn_thread_arg +{ + PFS_thread *m_parent_thread; + PSI_thread_key m_child_key; + const void *m_child_identity; + void *(*m_user_start_routine)(void*); + void *m_user_arg; +}; + +void* pfs_spawn_thread(void *arg) +{ + PFS_spawn_thread_arg *typed_arg= (PFS_spawn_thread_arg*) arg; + void *user_arg; + void *(*user_start_routine)(void*); + + PFS_thread *pfs; + + /* First, attach instrumentation to this newly created pthread. */ + PFS_thread_class *klass= find_thread_class(typed_arg->m_child_key); + if (likely(klass != NULL)) + pfs= create_thread(klass, typed_arg->m_child_identity, 0); + else + pfs= NULL; + my_pthread_setspecific_ptr(THR_PFS, pfs); + + /* + Secondly, free the memory allocated in spawn_thread_v1(). + It is preferable to do this before invoking the user + routine, to avoid memory leaks at shutdown, in case + the server exits without waiting for this thread. + */ + user_start_routine= typed_arg->m_user_start_routine; + user_arg= typed_arg->m_user_arg; + my_free(typed_arg, MYF(0)); + + /* Then, execute the user code for this thread. */ + (*user_start_routine)(user_arg); + + return NULL; +} + +static int spawn_thread_v1(PSI_thread_key key, + pthread_t *thread, const pthread_attr_t *attr, + void *(*start_routine)(void*), void *arg) +{ + PFS_spawn_thread_arg *psi_arg; + + /* psi_arg can not be global, and can not be a local variable. */ + psi_arg= (PFS_spawn_thread_arg*) my_malloc(sizeof(PFS_spawn_thread_arg), + MYF(MY_WME)); + if (unlikely(psi_arg == NULL)) + return EAGAIN; + + psi_arg->m_parent_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); + psi_arg->m_child_key= key; + psi_arg->m_child_identity= (arg ? arg : thread); + psi_arg->m_user_start_routine= start_routine; + psi_arg->m_user_arg= arg; + + int result= pthread_create(thread, attr, pfs_spawn_thread, psi_arg); + if (unlikely(result != 0)) + my_free(psi_arg, MYF(0)); + return result; +} + +static PSI_thread* +new_thread_v1(PSI_thread_key key, const void *identity, ulong thread_id) +{ + PFS_thread *pfs; + + PFS_thread_class *klass= find_thread_class(key); + if (likely(klass != NULL)) + pfs= create_thread(klass, identity, thread_id); + else + pfs= NULL; + + return reinterpret_cast<PSI_thread*> (pfs); +} + +static void set_thread_id_v1(PSI_thread *thread, unsigned long id) +{ + DBUG_ASSERT(thread); + PFS_thread *pfs= reinterpret_cast<PFS_thread*> (thread); + pfs->m_thread_id= id; +} + +static PSI_thread* +get_thread_v1(void) +{ + PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); + return reinterpret_cast<PSI_thread*> (pfs); +} + +static void set_thread_v1(PSI_thread* thread) +{ + PFS_thread *pfs= reinterpret_cast<PFS_thread*> (thread); + my_pthread_setspecific_ptr(THR_PFS, pfs); +} + +static void delete_current_thread_v1(void) +{ + PFS_thread *thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); + if (thread != NULL) + { + my_pthread_setspecific_ptr(THR_PFS, NULL); + destroy_thread(thread); + } +} + +static PSI_mutex_locker* +get_thread_mutex_locker_v1(PSI_mutex *mutex, PSI_mutex_operation op) +{ + PFS_mutex *pfs_mutex= reinterpret_cast<PFS_mutex*> (mutex); + DBUG_ASSERT((int) op >= 0); + DBUG_ASSERT((uint) op < array_elements(mutex_operation_map)); + DBUG_ASSERT(pfs_mutex != NULL); + DBUG_ASSERT(pfs_mutex->m_class != NULL); + if (! flag_events_waits_current) + return NULL; + if (! pfs_mutex->m_class->m_enabled) + return NULL; + PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); + if (unlikely(pfs_thread == NULL)) + return NULL; + if (! pfs_thread->m_enabled) + return NULL; + if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE)) + { + locker_lost++; + return NULL; + } + PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack + [pfs_thread->m_wait_locker_count]; + pfs_locker->m_waits_current.m_wait_class= NO_WAIT_CLASS; + + pfs_locker->m_target.m_mutex= pfs_mutex; + pfs_locker->m_waits_current.m_thread= pfs_thread; + pfs_locker->m_waits_current.m_class= pfs_mutex->m_class; + if (pfs_mutex->m_class->m_timed) + { + pfs_locker->m_timer_name= wait_timer; + pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING; + } + else + pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED; + pfs_locker->m_waits_current.m_object_instance_addr= pfs_mutex->m_identity; + pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++; + pfs_locker->m_waits_current.m_operation= mutex_operation_map[(int) op]; + pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_MUTEX; + + pfs_thread->m_wait_locker_count++; + return reinterpret_cast<PSI_mutex_locker*> (pfs_locker); +} + +static PSI_rwlock_locker* +get_thread_rwlock_locker_v1(PSI_rwlock *rwlock, PSI_rwlock_operation op) +{ + PFS_rwlock *pfs_rwlock= reinterpret_cast<PFS_rwlock*> (rwlock); + DBUG_ASSERT(static_cast<int> (op) >= 0); + DBUG_ASSERT(static_cast<uint> (op) < array_elements(rwlock_operation_map)); + DBUG_ASSERT(pfs_rwlock != NULL); + DBUG_ASSERT(pfs_rwlock->m_class != NULL); + if (! flag_events_waits_current) + return NULL; + if (! pfs_rwlock->m_class->m_enabled) + return NULL; + PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); + if (unlikely(pfs_thread == NULL)) + return NULL; + if (! pfs_thread->m_enabled) + return NULL; + if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE)) + { + locker_lost++; + return NULL; + } + PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack + [pfs_thread->m_wait_locker_count]; + pfs_locker->m_waits_current.m_wait_class= NO_WAIT_CLASS; + + pfs_locker->m_target.m_rwlock= pfs_rwlock; + pfs_locker->m_waits_current.m_thread= pfs_thread; + pfs_locker->m_waits_current.m_class= pfs_rwlock->m_class; + if (pfs_rwlock->m_class->m_timed) + { + pfs_locker->m_timer_name= wait_timer; + pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING; + } + else + pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED; + pfs_locker->m_waits_current.m_object_instance_addr= pfs_rwlock->m_identity; + pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++; + pfs_locker->m_waits_current.m_operation= + rwlock_operation_map[static_cast<int> (op)]; + pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_RWLOCK; + + pfs_thread->m_wait_locker_count++; + return reinterpret_cast<PSI_rwlock_locker*> (pfs_locker); +} + +static PSI_cond_locker* +get_thread_cond_locker_v1(PSI_cond *cond, PSI_mutex * /* unused: mutex */, + PSI_cond_operation op) +{ + /* + Note about the unused PSI_mutex *mutex parameter: + In the pthread library, a call to pthread_cond_wait() + causes an unlock() + lock() on the mutex associated with the condition. + This mutex operation is not instrumented, so the mutex will still + appear as locked when a thread is waiting on a condition. + This has no impact now, as unlock_mutex() is not recording events. + When unlock_mutex() is implemented by later work logs, + this parameter here will be used to adjust the mutex state, + in start_cond_wait_v1() and end_cond_wait_v1(). + */ + PFS_cond *pfs_cond= reinterpret_cast<PFS_cond*> (cond); + DBUG_ASSERT(static_cast<int> (op) >= 0); + DBUG_ASSERT(static_cast<uint> (op) < array_elements(cond_operation_map)); + DBUG_ASSERT(pfs_cond != NULL); + DBUG_ASSERT(pfs_cond->m_class != NULL); + if (! flag_events_waits_current) + return NULL; + if (! pfs_cond->m_class->m_enabled) + return NULL; + PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); + if (unlikely(pfs_thread == NULL)) + return NULL; + if (! pfs_thread->m_enabled) + return NULL; + if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE)) + { + locker_lost++; + return NULL; + } + PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack + [pfs_thread->m_wait_locker_count]; + pfs_locker->m_waits_current.m_wait_class= NO_WAIT_CLASS; + + pfs_locker->m_target.m_cond= pfs_cond; + pfs_locker->m_waits_current.m_thread= pfs_thread; + pfs_locker->m_waits_current.m_class= pfs_cond->m_class; + if (pfs_cond->m_class->m_timed) + { + pfs_locker->m_timer_name= wait_timer; + pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING; + } + else + pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED; + pfs_locker->m_waits_current.m_object_instance_addr= pfs_cond->m_identity; + pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++; + pfs_locker->m_waits_current.m_operation= + cond_operation_map[static_cast<int> (op)]; + pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_COND; + + pfs_thread->m_wait_locker_count++; + return reinterpret_cast<PSI_cond_locker*> (pfs_locker); +} + +static PSI_table_locker* +get_thread_table_locker_v1(PSI_table *table) +{ + PFS_table *pfs_table= reinterpret_cast<PFS_table*> (table); + DBUG_ASSERT(pfs_table != NULL); + DBUG_ASSERT(pfs_table->m_share != NULL); + if (! flag_events_waits_current) + return NULL; + if (! pfs_table->m_share->m_enabled) + return NULL; + PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); + if (unlikely(pfs_thread == NULL)) + return NULL; + if (! pfs_thread->m_enabled) + return NULL; + if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE)) + { + locker_lost++; + return NULL; + } + PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack + [pfs_thread->m_wait_locker_count]; + pfs_locker->m_waits_current.m_wait_class= NO_WAIT_CLASS; + + pfs_locker->m_target.m_table= pfs_table; + pfs_locker->m_waits_current.m_thread= pfs_thread; + pfs_locker->m_waits_current.m_class= &global_table_class; + if (pfs_table->m_share->m_timed) + { + pfs_locker->m_timer_name= wait_timer; + pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING; + } + else + pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED; + pfs_locker->m_waits_current.m_object_instance_addr= pfs_table->m_identity; + pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++; + pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_TABLE; + + pfs_thread->m_wait_locker_count++; + return reinterpret_cast<PSI_table_locker*> (pfs_locker); +} + +static PSI_file_locker* +get_thread_file_name_locker_v1(PSI_file_key key, + PSI_file_operation op, + const char *name, const void *identity) +{ + DBUG_ASSERT(static_cast<int> (op) >= 0); + DBUG_ASSERT(static_cast<uint> (op) < array_elements(file_operation_map)); + + if (! flag_events_waits_current) + return NULL; + PFS_file_class *klass= find_file_class(key); + if (unlikely(klass == NULL)) + return NULL; + if (! klass->m_enabled) + return NULL; + PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); + if (unlikely(pfs_thread == NULL)) + return NULL; + if (! pfs_thread->m_enabled) + return NULL; + if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE)) + { + locker_lost++; + return NULL; + } + uint len= strlen(name); + PFS_file *pfs_file= find_or_create_file(pfs_thread, klass, name, len); + if (unlikely(pfs_file == NULL)) + return NULL; + + PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack + [pfs_thread->m_wait_locker_count]; + pfs_locker->m_waits_current.m_wait_class= NO_WAIT_CLASS; + + pfs_locker->m_target.m_file= pfs_file; + pfs_locker->m_waits_current.m_thread= pfs_thread; + pfs_locker->m_waits_current.m_class= pfs_file->m_class; + if (pfs_file->m_class->m_timed) + { + pfs_locker->m_timer_name= wait_timer; + pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING; + } + else + pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED; + pfs_locker->m_waits_current.m_object_instance_addr= pfs_file; + pfs_locker->m_waits_current.m_object_name= pfs_file->m_filename; + pfs_locker->m_waits_current.m_object_name_length= + pfs_file->m_filename_length; + pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++; + pfs_locker->m_waits_current.m_operation= + file_operation_map[static_cast<int> (op)]; + pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_FILE; + + pfs_thread->m_wait_locker_count++; + return reinterpret_cast<PSI_file_locker*> (pfs_locker); +} + +static PSI_file_locker* +get_thread_file_stream_locker_v1(PSI_file *file, PSI_file_operation op) +{ + PFS_file *pfs_file= reinterpret_cast<PFS_file*> (file); + + DBUG_ASSERT(static_cast<int> (op) >= 0); + DBUG_ASSERT(static_cast<uint> (op) < array_elements(file_operation_map)); + DBUG_ASSERT(pfs_file != NULL); + DBUG_ASSERT(pfs_file->m_class != NULL); + + if (! flag_events_waits_current) + return NULL; + if (! pfs_file->m_class->m_enabled) + return NULL; + PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); + if (unlikely(pfs_thread == NULL)) + return NULL; + if (! pfs_thread->m_enabled) + return NULL; + if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE)) + { + locker_lost++; + return NULL; + } + PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack + [pfs_thread->m_wait_locker_count]; + pfs_locker->m_waits_current.m_wait_class= NO_WAIT_CLASS; + + pfs_locker->m_target.m_file= pfs_file; + pfs_locker->m_waits_current.m_thread= pfs_thread; + pfs_locker->m_waits_current.m_class= pfs_file->m_class; + if (pfs_file->m_class->m_timed) + { + pfs_locker->m_timer_name= wait_timer; + pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING; + } + else + pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED; + pfs_locker->m_waits_current.m_object_instance_addr= pfs_file; + pfs_locker->m_waits_current.m_object_name= pfs_file->m_filename; + pfs_locker->m_waits_current.m_object_name_length= + pfs_file->m_filename_length; + pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++; + pfs_locker->m_waits_current.m_operation= + file_operation_map[static_cast<int> (op)]; + pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_FILE; + + pfs_thread->m_wait_locker_count++; + return reinterpret_cast<PSI_file_locker*> (pfs_locker); +} + +static PSI_file_locker* +get_thread_file_descriptor_locker_v1(File file, PSI_file_operation op) +{ + int index= static_cast<int> (file); + + DBUG_ASSERT(static_cast<int> (op) >= 0); + DBUG_ASSERT(static_cast<uint> (op) < array_elements(file_operation_map)); + + if (! flag_events_waits_current) + return NULL; + if (likely((index >= 0) && (index < file_handle_max))) + { + PFS_file *pfs_file= file_handle_array[index]; + if (likely(pfs_file != NULL)) + { + PFS_thread *pfs_thread; + + /* + We are about to close a file by descriptor number, + and the calling code still holds the descriptor. + Cleanup the file descriptor <--> file instrument association. + Remove the instrumentation *before* the close to avoid race + conditions with another thread opening a file + (that could be given the same descriptor). + */ + if (op == PSI_FILE_CLOSE) + file_handle_array[index]= NULL; + + DBUG_ASSERT(pfs_file->m_class != NULL); + if (! pfs_file->m_class->m_enabled) + return NULL; + pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); + if (unlikely(pfs_thread == NULL)) + return NULL; + if (! pfs_thread->m_enabled) + return NULL; + if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE)) + { + locker_lost++; + return NULL; + } + PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack + [pfs_thread->m_wait_locker_count]; + pfs_locker->m_waits_current.m_wait_class= NO_WAIT_CLASS; + + pfs_locker->m_target.m_file= pfs_file; + pfs_locker->m_waits_current.m_thread= pfs_thread; + pfs_locker->m_waits_current.m_class= pfs_file->m_class; + if (pfs_file->m_class->m_timed) + { + pfs_locker->m_timer_name= wait_timer; + pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING; + } + else + pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED; + pfs_locker->m_waits_current.m_object_instance_addr= pfs_file; + pfs_locker->m_waits_current.m_object_name= pfs_file->m_filename; + pfs_locker->m_waits_current.m_object_name_length= + pfs_file->m_filename_length; + pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++; + pfs_locker->m_waits_current.m_operation= + file_operation_map[static_cast<int> (op)]; + pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_FILE; + + pfs_thread->m_wait_locker_count++; + return reinterpret_cast<PSI_file_locker*> (pfs_locker); + } + } + return NULL; +} + +static void unlock_mutex_v1(PSI_thread * thread, PSI_mutex *mutex) +{ + PFS_mutex *pfs_mutex= reinterpret_cast<PFS_mutex*> (mutex); + DBUG_ASSERT(pfs_mutex != NULL); + + /* + Note that this code is still protected by the instrumented mutex, + and therefore is thread safe. See inline_mysql_mutex_unlock(). + */ + + /* Always update the instrumented state */ + pfs_mutex->m_owner= NULL; + pfs_mutex->m_last_locked= 0; + +#ifdef LATER_WL2333 + /* + See WL#2333: SHOW ENGINE ... LOCK STATUS. + PFS_mutex::m_lock_stat is not exposed in user visible tables + currently, so there is no point spending time computing it. + */ + PFS_thread *pfs_thread= reinterpret_cast<PFS_thread*> (thread); + DBUG_ASSERT(pfs_thread != NULL); + + if (unlikely(! flag_events_waits_current)) + return; + if (! pfs_mutex->m_class->m_enabled) + return; + if (! pfs_thread->m_enabled) + return; + + if (pfs_mutex->m_class->m_timed) + { + ulonglong locked_time; + locked_time= get_timer_value(wait_timer) - pfs_mutex->m_last_locked; + aggregate_single_stat_chain(&pfs_mutex->m_lock_stat, locked_time); + } +#endif +} + +static void unlock_rwlock_v1(PSI_thread *thread, PSI_rwlock *rwlock) +{ + PFS_rwlock *pfs_rwlock= reinterpret_cast<PFS_rwlock*> (rwlock); + DBUG_ASSERT(pfs_rwlock != NULL); + bool last_writer= false; + bool last_reader= false; + + /* + Note that this code is still protected by the instrumented rwlock, + and therefore is: + - thread safe for write locks + - almost thread safe for read locks (pfs_rwlock->m_readers is unsafe). + See inline_mysql_rwlock_unlock() + */ + + /* Always update the instrumented state */ + if (pfs_rwlock->m_writer) + { + /* Nominal case, a writer is unlocking. */ + last_writer= true; + pfs_rwlock->m_writer= NULL; + /* Reset the readers stats, they could be off */ + pfs_rwlock->m_readers= 0; + } + else if (likely(pfs_rwlock->m_readers > 0)) + { + /* Nominal case, a reader is unlocking. */ + if (--(pfs_rwlock->m_readers) == 0) + last_reader= true; + } + else + { + /* + Edge case, we have no writer and no readers, + on an unlock event. + This is possible for: + - partial instrumentation + - instrumentation disabled at runtime, + see when get_thread_rwlock_locker_v1() returns NULL + No further action is taken here, the next + write lock will put the statistics is a valid state. + */ + } + +#ifdef LATER_WL2333 + /* See WL#2333: SHOW ENGINE ... LOCK STATUS. */ + PFS_thread *pfs_thread= reinterpret_cast<PFS_thread*> (thread); + DBUG_ASSERT(pfs_thread != NULL); + + if (unlikely(! flag_events_waits_current)) + return; + if (! pfs_rwlock->m_class->m_enabled) + return; + if (! pfs_thread->m_enabled) + return; + + ulonglong locked_time; + if (last_writer) + { + if (pfs_rwlock->m_class->m_timed) + { + locked_time= get_timer_value(wait_timer) - pfs_rwlock->m_last_written; + aggregate_single_stat_chain(&pfs_rwlock->m_write_lock_stat, locked_time); + } + } + else if (last_reader) + { + if (pfs_rwlock->m_class->m_timed) + { + locked_time= get_timer_value(wait_timer) - pfs_rwlock->m_last_read; + aggregate_single_stat_chain(&pfs_rwlock->m_read_lock_stat, locked_time); + } + } +#endif +} + +static void signal_cond_v1(PSI_thread *thread, PSI_cond* cond) +{ + PFS_cond *pfs_cond= reinterpret_cast<PFS_cond*> (cond); + DBUG_ASSERT(pfs_cond != NULL); + + pfs_cond->m_cond_stat.m_signal_count++; +} + +static void broadcast_cond_v1(PSI_thread *thread, PSI_cond* cond) +{ + PFS_cond *pfs_cond= reinterpret_cast<PFS_cond*> (cond); + DBUG_ASSERT(pfs_cond != NULL); + + pfs_cond->m_cond_stat.m_broadcast_count++; +} + +static void start_mutex_wait_v1(PSI_mutex_locker* locker, + const char *src_file, uint src_line) +{ + PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker); + DBUG_ASSERT(pfs_locker != NULL); + + PFS_events_waits *wait= &pfs_locker->m_waits_current; + if (wait->m_timer_state == TIMER_STATE_STARTING) + { + wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name); + wait->m_timer_state= TIMER_STATE_STARTED; + } + wait->m_source_file= src_file; + wait->m_source_line= src_line; +} + +static void end_mutex_wait_v1(PSI_mutex_locker* locker, int rc) +{ + PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker); + DBUG_ASSERT(pfs_locker != NULL); + PFS_events_waits *wait= &pfs_locker->m_waits_current; + + if (wait->m_timer_state == TIMER_STATE_STARTED) + { + wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name); + wait->m_timer_state= TIMER_STATE_TIMED; + } + if (flag_events_waits_history) + insert_events_waits_history(wait->m_thread, wait); + if (flag_events_waits_history_long) + insert_events_waits_history_long(wait); + + if (rc == 0) + { + /* Thread safe: we are protected by the instrumented mutex */ + PFS_single_stat_chain *stat; + PFS_mutex *mutex= pfs_locker->m_target.m_mutex; + mutex->m_owner= wait->m_thread; + mutex->m_last_locked= wait->m_timer_end; + + ulonglong wait_time= wait->m_timer_end - wait->m_timer_start; + aggregate_single_stat_chain(&mutex->m_wait_stat, wait_time); + stat= find_per_thread_mutex_class_wait_stat(wait->m_thread, + mutex->m_class); + aggregate_single_stat_chain(stat, wait_time); + } + wait->m_thread->m_wait_locker_count--; +} + +static void start_rwlock_rdwait_v1(PSI_rwlock_locker* locker, + const char *src_file, uint src_line) +{ + PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker); + DBUG_ASSERT(pfs_locker != NULL); + + PFS_events_waits *wait= &pfs_locker->m_waits_current; + if (wait->m_timer_state == TIMER_STATE_STARTING) + { + wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name); + wait->m_timer_state= TIMER_STATE_STARTED; + } + wait->m_source_file= src_file; + wait->m_source_line= src_line; +} + +static void end_rwlock_rdwait_v1(PSI_rwlock_locker* locker, int rc) +{ + PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker); + DBUG_ASSERT(pfs_locker != NULL); + PFS_events_waits *wait= &pfs_locker->m_waits_current; + + if (wait->m_timer_state == TIMER_STATE_STARTED) + { + wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name); + wait->m_timer_state= TIMER_STATE_TIMED; + } + if (flag_events_waits_history) + insert_events_waits_history(wait->m_thread, wait); + if (flag_events_waits_history_long) + insert_events_waits_history_long(wait); + + if (rc == 0) + { + /* + Warning: + Multiple threads can execute this section concurrently + (since multiple readers can execute in parallel). + The statistics generated are not safe, which is why they are + just statistics, not facts. + */ + PFS_single_stat_chain *stat; + PFS_rwlock *rwlock= pfs_locker->m_target.m_rwlock; + if (rwlock->m_readers == 0) + rwlock->m_last_read= wait->m_timer_end; + rwlock->m_writer= NULL; + rwlock->m_readers++; + + ulonglong wait_time= wait->m_timer_end - wait->m_timer_start; + aggregate_single_stat_chain(&rwlock->m_wait_stat, wait_time); + stat= find_per_thread_rwlock_class_wait_stat(wait->m_thread, + rwlock->m_class); + aggregate_single_stat_chain(stat, wait_time); + } + wait->m_thread->m_wait_locker_count--; +} + +static void start_rwlock_wrwait_v1(PSI_rwlock_locker* locker, + const char *src_file, uint src_line) +{ + PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker); + DBUG_ASSERT(pfs_locker != NULL); + + PFS_events_waits *wait= &pfs_locker->m_waits_current; + if (wait->m_timer_state == TIMER_STATE_STARTING) + { + wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name); + wait->m_timer_state= TIMER_STATE_STARTED; + } + wait->m_source_file= src_file; + wait->m_source_line= src_line; +} + +static void end_rwlock_wrwait_v1(PSI_rwlock_locker* locker, int rc) +{ + PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker); + DBUG_ASSERT(pfs_locker != NULL); + PFS_events_waits *wait= &pfs_locker->m_waits_current; + + if (wait->m_timer_state == TIMER_STATE_STARTED) + { + wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name); + wait->m_timer_state= TIMER_STATE_TIMED; + } + if (flag_events_waits_history) + insert_events_waits_history(wait->m_thread, wait); + if (flag_events_waits_history_long) + insert_events_waits_history_long(wait); + + if (rc == 0) + { + /* Thread safe : we are protected by the instrumented rwlock */ + PFS_single_stat_chain *stat; + PFS_rwlock *rwlock= pfs_locker->m_target.m_rwlock; + rwlock->m_writer= wait->m_thread; + rwlock->m_last_written= wait->m_timer_end; + /* Reset the readers stats, they could be off */ + rwlock->m_readers= 0; + rwlock->m_last_read= 0; + + ulonglong wait_time= wait->m_timer_end - wait->m_timer_start; + aggregate_single_stat_chain(&rwlock->m_wait_stat, wait_time); + stat= find_per_thread_rwlock_class_wait_stat(wait->m_thread, + rwlock->m_class); + aggregate_single_stat_chain(stat, wait_time); + } + wait->m_thread->m_wait_locker_count--; +} + +static void start_cond_wait_v1(PSI_cond_locker* locker, + const char *src_file, uint src_line) +{ + PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker); + DBUG_ASSERT(pfs_locker != NULL); + + PFS_events_waits *wait= &pfs_locker->m_waits_current; + if (wait->m_timer_state == TIMER_STATE_STARTING) + { + wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name); + wait->m_timer_state= TIMER_STATE_STARTED; + } + wait->m_source_file= src_file; + wait->m_source_line= src_line; +} + +static void end_cond_wait_v1(PSI_cond_locker* locker, int rc) +{ + PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker); + DBUG_ASSERT(pfs_locker != NULL); + PFS_events_waits *wait= &pfs_locker->m_waits_current; + + if (wait->m_timer_state == TIMER_STATE_STARTED) + { + wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name); + wait->m_timer_state= TIMER_STATE_TIMED; + } + if (flag_events_waits_history) + insert_events_waits_history(wait->m_thread, wait); + if (flag_events_waits_history_long) + insert_events_waits_history_long(wait); + + if (rc == 0) + { + /* + Not thread safe, race conditions will occur. + A first race condition is: + - thread 1 waits on cond A + - thread 2 waits on cond B + threads 1 and 2 compete when updating the same cond A + statistics, possibly missing a min / max / sum / count. + A second race condition is: + - thread 1 waits on cond A + - thread 2 destroys cond A + - thread 2 or 3 creates cond B in the same condition slot + thread 1 will then aggregate statistics about defunct A + in condition B. + This is accepted, the data will be slightly inaccurate. + */ + PFS_single_stat_chain *stat; + PFS_cond *cond= pfs_locker->m_target.m_cond; + + ulonglong wait_time= wait->m_timer_end - wait->m_timer_start; + aggregate_single_stat_chain(&cond->m_wait_stat, wait_time); + stat= find_per_thread_cond_class_wait_stat(wait->m_thread, + cond->m_class); + aggregate_single_stat_chain(stat, wait_time); + } + wait->m_thread->m_wait_locker_count--; +} + +static void start_table_wait_v1(PSI_table_locker* locker, + const char *src_file, uint src_line) +{ + PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker); + DBUG_ASSERT(pfs_locker != NULL); + + PFS_events_waits *wait= &pfs_locker->m_waits_current; + if (wait->m_timer_state == TIMER_STATE_STARTING) + { + wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name); + wait->m_timer_state= TIMER_STATE_STARTED; + } + wait->m_source_file= src_file; + wait->m_source_line= src_line; + wait->m_operation= OPERATION_TYPE_LOCK; + PFS_table_share *share= pfs_locker->m_target.m_table->m_share; + wait->m_schema_name= share->m_schema_name; + wait->m_schema_name_length= share->m_schema_name_length; + wait->m_object_name= share->m_table_name; + wait->m_object_name_length= share->m_table_name_length; +} + +static void end_table_wait_v1(PSI_table_locker* locker) +{ + PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker); + DBUG_ASSERT(pfs_locker != NULL); + PFS_events_waits *wait= &pfs_locker->m_waits_current; + + if (wait->m_timer_state == TIMER_STATE_STARTED) + { + wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name); + wait->m_timer_state= TIMER_STATE_TIMED; + } + if (flag_events_waits_history) + insert_events_waits_history(wait->m_thread, wait); + if (flag_events_waits_history_long) + insert_events_waits_history_long(wait); + + PFS_table *table= pfs_locker->m_target.m_table; + ulonglong wait_time= wait->m_timer_end - wait->m_timer_start; + aggregate_single_stat_chain(&table->m_wait_stat, wait_time); + + /* + There is currently no per table and per thread aggregation. + The number of tables in the application is arbitrary, and may be high. + The number of slots per thread to hold aggregates is fixed, + and is constrained by memory. + Implementing a per thread and per table aggregate has not been + decided yet. + If it's implemented, it's likely that the user will have to specify, + per table name, if the aggregate per thread is to be computed or not. + This will mean a SETUP_ table. + */ + wait->m_thread->m_wait_locker_count--; +} + +static void start_file_wait_v1(PSI_file_locker *locker, + size_t count, + const char *src_file, + uint src_line); + +static void end_file_wait_v1(PSI_file_locker *locker, + size_t count); + +static PSI_file* start_file_open_wait_v1(PSI_file_locker *locker, + const char *src_file, + uint src_line) +{ + PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker); + DBUG_ASSERT(pfs_locker != NULL); + + start_file_wait_v1(locker, 0, src_file, src_line); + + PFS_file *pfs_file= pfs_locker->m_target.m_file; + return reinterpret_cast<PSI_file*> (pfs_file); +} + +static void end_file_open_wait_v1(PSI_file_locker *locker) +{ + end_file_wait_v1(locker, 0); +} + +static void end_file_open_wait_and_bind_to_descriptor_v1 + (PSI_file_locker *locker, File file) +{ + int index= (int) file; + PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker); + DBUG_ASSERT(pfs_locker != NULL); + + end_file_wait_v1(locker, 0); + + PFS_file *pfs_file= pfs_locker->m_target.m_file; + DBUG_ASSERT(pfs_file != NULL); + + if (likely(index >= 0)) + { + if (likely(index < file_handle_max)) + file_handle_array[index]= pfs_file; + else + file_handle_lost++; + } + else + release_file(pfs_file); +} + +static void start_file_wait_v1(PSI_file_locker *locker, + size_t count, + const char *src_file, + uint src_line) +{ + PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker); + DBUG_ASSERT(pfs_locker != NULL); + + PFS_events_waits *wait= &pfs_locker->m_waits_current; + if (wait->m_timer_state == TIMER_STATE_STARTING) + { + wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name); + wait->m_timer_state= TIMER_STATE_STARTED; + } + wait->m_source_file= src_file; + wait->m_source_line= src_line; + wait->m_number_of_bytes= count; +} + +static void end_file_wait_v1(PSI_file_locker *locker, + size_t count) +{ + PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker); + DBUG_ASSERT(pfs_locker != NULL); + PFS_events_waits *wait= &pfs_locker->m_waits_current; + + wait->m_number_of_bytes= count; + if (wait->m_timer_state == TIMER_STATE_STARTED) + { + wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name); + wait->m_timer_state= TIMER_STATE_TIMED; + } + if (flag_events_waits_history) + insert_events_waits_history(wait->m_thread, wait); + if (flag_events_waits_history_long) + insert_events_waits_history_long(wait); + + PFS_single_stat_chain *stat; + PFS_file *file= pfs_locker->m_target.m_file; + + ulonglong wait_time= wait->m_timer_end - wait->m_timer_start; + aggregate_single_stat_chain(&file->m_wait_stat, wait_time); + stat= find_per_thread_file_class_wait_stat(wait->m_thread, + file->m_class); + aggregate_single_stat_chain(stat, wait_time); + + PFS_file_class *klass= file->m_class; + + switch(wait->m_operation) + { + case OPERATION_TYPE_FILEREAD: + file->m_file_stat.m_count_read++; + file->m_file_stat.m_read_bytes+= count; + klass->m_file_stat.m_count_read++; + klass->m_file_stat.m_read_bytes+= count; + break; + case OPERATION_TYPE_FILEWRITE: + file->m_file_stat.m_count_write++; + file->m_file_stat.m_write_bytes+= count; + klass->m_file_stat.m_count_write++; + klass->m_file_stat.m_write_bytes+= count; + break; + case OPERATION_TYPE_FILECLOSE: + case OPERATION_TYPE_FILESTREAMCLOSE: + case OPERATION_TYPE_FILESTAT: + release_file(pfs_locker->m_target.m_file); + break; + case OPERATION_TYPE_FILEDELETE: + destroy_file(wait->m_thread, pfs_locker->m_target.m_file); + break; + default: + break; + } + + wait->m_thread->m_wait_locker_count--; +} + +PSI_v1 PFS_v1= +{ + register_mutex_v1, + register_rwlock_v1, + register_cond_v1, + register_thread_v1, + register_file_v1, + init_mutex_v1, + destroy_mutex_v1, + init_rwlock_v1, + destroy_rwlock_v1, + init_cond_v1, + destroy_cond_v1, + get_table_share_v1, + release_table_share_v1, + open_table_v1, + close_table_v1, + create_file_v1, + spawn_thread_v1, + new_thread_v1, + set_thread_id_v1, + get_thread_v1, + set_thread_v1, + delete_current_thread_v1, + get_thread_mutex_locker_v1, + get_thread_rwlock_locker_v1, + get_thread_cond_locker_v1, + get_thread_table_locker_v1, + get_thread_file_name_locker_v1, + get_thread_file_stream_locker_v1, + get_thread_file_descriptor_locker_v1, + unlock_mutex_v1, + unlock_rwlock_v1, + signal_cond_v1, + broadcast_cond_v1, + start_mutex_wait_v1, + end_mutex_wait_v1, + start_rwlock_rdwait_v1, + end_rwlock_rdwait_v1, + start_rwlock_wrwait_v1, + end_rwlock_wrwait_v1, + start_cond_wait_v1, + end_cond_wait_v1, + start_table_wait_v1, + end_table_wait_v1, + start_file_open_wait_v1, + end_file_open_wait_v1, + end_file_open_wait_and_bind_to_descriptor_v1, + start_file_wait_v1, + end_file_wait_v1 +}; + +static void* get_interface(int version) +{ + switch (version) + { + case PSI_VERSION_1: + return &PFS_v1; + default: + return NULL; + } +} + +struct PSI_bootstrap PFS_bootstrap= +{ + get_interface +}; + |