author    | GitLab Bot <gitlab-bot@gitlab.com> | 2020-08-20 18:42:06 +0000
committer | GitLab Bot <gitlab-bot@gitlab.com> | 2020-08-20 18:42:06 +0000
commit    | 6e4e1050d9dba2b7b2523fdd1768823ab85feef4 (patch)
tree      | 78be5963ec075d80116a932011d695dd33910b4e /qa/qa/service
parent    | 1ce776de4ae122aba3f349c02c17cebeaa8ecf07 (diff)
download  | gitlab-ce-6e4e1050d9dba2b7b2523fdd1768823ab85feef4.tar.gz
Add latest changes from gitlab-org/gitlab@13-3-stable-ee
Diffstat (limited to 'qa/qa/service')
-rw-r--r-- | qa/qa/service/cluster_provider/k3s.rb     |   1
-rw-r--r-- | qa/qa/service/docker_run/gitlab_runner.rb |   2
-rw-r--r-- | qa/qa/service/docker_run/k3s.rb           |   6
-rw-r--r-- | qa/qa/service/praefect_manager.rb         | 370
-rw-r--r-- | qa/qa/service/shellout.rb                 |  25
5 files changed, 331 insertions, 73 deletions
diff --git a/qa/qa/service/cluster_provider/k3s.rb b/qa/qa/service/cluster_provider/k3s.rb
index 165de795683..cf916d148da 100644
--- a/qa/qa/service/cluster_provider/k3s.rb
+++ b/qa/qa/service/cluster_provider/k3s.rb
@@ -10,6 +10,7 @@ module QA
       def setup
         @k3s = Service::DockerRun::K3s.new.tap do |k3s|
+          k3s.remove!
           k3s.register!
 
           shell "kubectl config set-cluster k3s --server https://#{k3s.host_name}:6443 --insecure-skip-tls-verify"
diff --git a/qa/qa/service/docker_run/gitlab_runner.rb b/qa/qa/service/docker_run/gitlab_runner.rb
index 6022ee4ceab..e15047a0f1d 100644
--- a/qa/qa/service/docker_run/gitlab_runner.rb
+++ b/qa/qa/service/docker_run/gitlab_runner.rb
@@ -92,7 +92,7 @@ module QA
         CMD
       end
 
-      # Ping CloudFlare DNS, should fail
+      # Ping Cloudflare DNS, should fail
       # Ping Registry, should fail to resolve
       def prove_airgap
         gitlab_ip = Resolv.getaddress 'registry.gitlab.com'
diff --git a/qa/qa/service/docker_run/k3s.rb b/qa/qa/service/docker_run/k3s.rb
index da254497ff0..07211b220f1 100644
--- a/qa/qa/service/docker_run/k3s.rb
+++ b/qa/qa/service/docker_run/k3s.rb
@@ -33,10 +33,12 @@ module QA
             --name #{@name}
             --publish 6443:6443
             --privileged
-            #{@image} server --cluster-secret some-secret
+            #{@image} server
+            --cluster-secret some-secret
+            --no-deploy traefik
           CMD
 
-          command.gsub!("--network #{network} ", '') unless QA::Runtime::Env.running_in_ci?
+          command.gsub!("--network #{network} --hostname #{host_name}", '') unless QA::Runtime::Env.running_in_ci?
 
           shell command
         end
diff --git a/qa/qa/service/praefect_manager.rb b/qa/qa/service/praefect_manager.rb
index a0433689e99..1f1761100c8 100644
--- a/qa/qa/service/praefect_manager.rb
+++ b/qa/qa/service/praefect_manager.rb
@@ -5,8 +5,12 @@ module QA
   module Service
     class PraefectManager
       include Service::Shellout
 
+      attr_accessor :gitlab
+
+      PrometheusQueryError = Class.new(StandardError)
+
       def initialize
-        @gitlab = 'gitlab-gitaly-ha'
+        @gitlab = 'gitlab-gitaly-cluster'
         @praefect = 'praefect'
         @postgres = 'postgres'
         @primary_node = 'gitaly1'
@@ -15,23 +19,37 @@ module QA
         @virtual_storage = 'default'
       end
 
-      def enable_writes
-        shell "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml enable-writes -virtual-storage #{@virtual_storage}'"
+      # Executes the praefect `dataloss` command.
+      #
+      # @return [Boolean] whether dataloss has occurred
+      def dataloss?
+        wait_until_shell_command_matches(dataloss_command, /Outdated repositories/)
       end
 
       def replicated?(project_id)
-        shell %(docker exec gitlab-gitaly-ha bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"') do |line|
-          # The output of the rake task looks something like this:
-          #
-          # Project name | gitaly1 (primary) | gitaly2 | gitaly3
-          # ----------------------------------------------------------------------------------------------------------------------------------------------------------------
-          # gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619
-          #
-          # We want to confirm that the checksums are identical
-          break line.split('|').map(&:strip)[1..3].uniq.one? if line.start_with?("gitaly_cluster")
+        Support::Retrier.retry_until(raise_on_failure: false) do
+          replicas = wait_until_shell_command(%(docker exec #{@gitlab} bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"')) do |line|
+            QA::Runtime::Logger.debug(line.chomp)
+            # The output of the rake task looks something like this:
+            #
+            # Project name | gitaly1 (primary) | gitaly2 | gitaly3
+            # ----------------------------------------------------------------------------------------------------------------------------------------------------------------
+            # gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619
+            #
+            break line if line.start_with?('gitaly_cluster')
+            break nil if line.include?('Something went wrong when getting replicas')
+          end
+          next false unless replicas
+
+          # We want to know if the checksums are identical
+          replicas&.split('|')&.map(&:strip)&.slice(1..3)&.uniq&.one?
         end
       end
 
+      def start_primary_node
+        start_node(@primary_node)
+      end
+
       def start_praefect
         start_node(@praefect)
       end
@@ -40,6 +58,14 @@ module QA
         stop_node(@praefect)
       end
 
+      def stop_secondary_node
+        stop_node(@secondary_node)
+      end
+
+      def start_secondary_node
+        start_node(@secondary_node)
+      end
+
       def start_node(name)
         shell "docker start #{name}"
       end
@@ -49,40 +75,79 @@ module QA
       end
 
       def trigger_failover_by_stopping_primary_node
+        QA::Runtime::Logger.info("Stopping node #{@primary_node} to trigger failover")
         stop_node(@primary_node)
+        wait_for_new_primary
       end
 
       def clear_replication_queue
-        QA::Runtime::Logger.debug("Clearing the replication queue")
-        shell <<~CMD
-          docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
-          bash -c "psql -U postgres -d praefect_production -h postgres.test \
-          -c \\"delete from replication_queue_job_lock; delete from replication_queue_lock; delete from replication_queue;\\""
-        CMD
+        QA::Runtime::Logger.info("Clearing the replication queue")
+        shell sql_to_docker_exec_cmd(
+          <<~SQL
+            delete from replication_queue_job_lock;
+            delete from replication_queue_lock;
+            delete from replication_queue;
+          SQL
+        )
       end
 
       def create_stalled_replication_queue
-        QA::Runtime::Logger.debug("Setting jobs in replication queue to `in_progress` and acquiring locks")
-        shell <<~CMD
-          docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
-          bash -c "psql -U postgres -d praefect_production -h postgres.test \
-          -c \\"update replication_queue set state = 'in_progress';
-          insert into replication_queue_job_lock (job_id, lock_id, triggered_at)
-          select id, rq.lock_id, created_at from replication_queue rq
-          left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id
-          where state = 'in_progress' and rqjl.job_id is null;
-          update replication_queue_lock set acquired = 't';\\""
-        CMD
+        QA::Runtime::Logger.info("Setting jobs in replication queue to `in_progress` and acquiring locks")
+        shell sql_to_docker_exec_cmd(
+          <<~SQL
+            update replication_queue set state = 'in_progress';
+            insert into replication_queue_job_lock (job_id, lock_id, triggered_at)
+            select id, rq.lock_id, created_at from replication_queue rq
+            left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id
+            where state = 'in_progress' and rqjl.job_id is null;
+            update replication_queue_lock set acquired = 't';
+          SQL
+        )
+      end
+
+      # Reconciles the previous primary node with the current one
+      # I.e., it brings the previous primary node up-to-date
+      def reconcile_nodes
+        reconcile_node_with_node(@primary_node, current_primary_node)
+      end
+
+      def reconcile_node_with_node(target, reference)
+        QA::Runtime::Logger.info("Reconcile #{target} with #{reference} on #{@virtual_storage}")
+        wait_until_shell_command_matches(
+          "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml reconcile -virtual #{@virtual_storage} -target #{target} -reference #{reference} -f'",
+          /FINISHED: \d+ repos were checked for consistency/,
+          sleep_interval: 5,
+          retry_on_exception: true
+        )
+      end
+
+      def query_read_distribution
+        output = shell "docker exec #{@gitlab} bash -c 'curl -s http://localhost:9090/api/v1/query?query=gitaly_praefect_read_distribution'" do |line|
+          QA::Runtime::Logger.debug(line)
+          break line
+        end
+        result = JSON.parse(output)
+
+        raise PrometheusQueryError, "Unable to query read distribution metrics" unless result['status'] == 'success'
+
+        result['data']['result'].map { |result| { node: result['metric']['storage'], value: result['value'][1].to_i } }
+      end
+
+      def replication_queue_incomplete_count
+        result = []
+        shell sql_to_docker_exec_cmd("select count(*) from replication_queue where state = 'ready' or state = 'in_progress';") do |line|
+          result << line
+        end
+        # The result looks like:
+        #   count
+        #   -----
+        #     1
+        result[2].to_i
       end
 
       def replication_queue_lock_count
         result = []
-        cmd = <<~CMD
-          docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
-          bash -c "psql -U postgres -d praefect_production -h postgres.test \
-          -c \\"select count(*) from replication_queue_lock where acquired = 't';\\""
-        CMD
-        shell cmd do |line|
+        shell sql_to_docker_exec_cmd("select count(*) from replication_queue_lock where acquired = 't';") do |line|
           result << line
         end
         # The result looks like:
@@ -92,19 +157,65 @@ module QA
         result[2].to_i
       end
 
-      def reset_cluster
-        start_node(@praefect)
+      # Makes the original primary (gitaly1) the primary again by
+      # stopping the other nodes, waiting for gitaly1 to be made the
+      # primary again, and then it starts the other nodes and enables
+      # writes
+      def reset_primary_to_original
+        QA::Runtime::Logger.info("Checking primary node...")
+
+        return if @primary_node == current_primary_node
+
+        QA::Runtime::Logger.info("Reset primary node to #{@primary_node}")
         start_node(@primary_node)
+        stop_node(@secondary_node)
+        stop_node(@tertiary_node)
+
+        wait_for_new_primary_node(@primary_node)
+
         start_node(@secondary_node)
         start_node(@tertiary_node)
-        enable_writes
+
+        wait_for_health_check_all_nodes
+        wait_for_reliable_connection
+      end
+
+      def verify_storage_move(source_storage, destination_storage)
+        return if QA::Runtime::Env.dot_com?
+
+        repo_path = verify_storage_move_from_gitaly(source_storage[:name])
+
+        destination_storage[:type] == :praefect ? verify_storage_move_to_praefect(repo_path, destination_storage[:name]) : verify_storage_move_to_gitaly(repo_path, destination_storage[:name])
       end
 
       def wait_for_praefect
+        QA::Runtime::Logger.info('Wait until Praefect starts and is listening')
         wait_until_shell_command_matches(
           "docker exec #{@praefect} bash -c 'cat /var/log/gitlab/praefect/current'",
           /listening at tcp address/
         )
+
+        # Praefect can fail to start if unable to dial one of the gitaly nodes
+        # See https://gitlab.com/gitlab-org/gitaly/-/issues/2847
+        # We tail the logs to allow us to confirm if that is the problem if tests fail
+        shell "docker exec #{@praefect} bash -c 'tail /var/log/gitlab/praefect/current'" do |line|
+          QA::Runtime::Logger.debug(line.chomp)
+        end
+      end
+
+      def wait_for_new_primary_node(node)
+        QA::Runtime::Logger.info("Wait until #{node} is the primary node")
+        with_praefect_log do |log|
+          break true if log['msg'] == 'primary node changed' && log['newPrimary'] == node
+        end
+      end
+
+      def wait_for_new_primary
+        QA::Runtime::Logger.info("Wait until a new primary node is selected")
+        with_praefect_log do |log|
+          break true if log['msg'] == 'primary node changed'
+        end
       end
 
       def wait_for_sql_ping
@@ -114,68 +225,187 @@ module QA
         )
       end
 
+      def wait_for_no_praefect_storage_error
+        # If a healthcheck error was the last message to be logged, we'll keep seeing that message even if it's no longer a problem
+        # That is, there's no message shown in the Praefect logs when the healthcheck succeeds
+        # To work around that we perform the gitaly check rake task, wait a few seconds, and then we confirm that no healthcheck errors appear
+        QA::Runtime::Logger.info("Checking that Praefect does not report healthcheck errors with its gitaly nodes")
+
+        Support::Waiter.wait_until(max_duration: 120) do
+          wait_for_gitaly_check
+
+          sleep 5
+
+          shell "docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'" do |line|
+            QA::Runtime::Logger.debug(line.chomp)
+            log = JSON.parse(line)
+
+            break true if log['msg'] != 'error when pinging healthcheck'
+          rescue JSON::ParserError
+            # Ignore lines that can't be parsed as JSON
+          end
+        end
+      end
+
       def wait_for_storage_nodes
-        nodes_confirmed = {
-          @primary_node => false,
-          @secondary_node => false,
-          @tertiary_node => false
-        }
-
-        wait_until_shell_command("docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'") do |line|
-          QA::Runtime::Logger.info(line.chomp)
-
-          nodes_confirmed.each_key do |node|
-            nodes_confirmed[node] = true if line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/
-          end
-
-          nodes_confirmed.values.all?
+        wait_for_no_praefect_storage_error
+
+        Support::Waiter.repeat_until(max_attempts: 3) do
+          nodes_confirmed = {
+            @primary_node => false,
+            @secondary_node => false,
+            @tertiary_node => false
+          }
+
+          wait_until_shell_command("docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'") do |line|
+            QA::Runtime::Logger.debug(line.chomp)
+
+            nodes_confirmed.each_key do |node|
+              nodes_confirmed[node] = true if line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/
+            end
+
+            nodes_confirmed.values.all?
+          end
         end
       end
 
+      def wait_for_health_check_current_primary_node
+        wait_for_health_check(current_primary_node)
+      end
+
+      def wait_for_health_check_all_nodes
+        wait_for_health_check(@primary_node)
+        wait_for_health_check(@secondary_node)
+        wait_for_health_check(@tertiary_node)
+      end
+
+      def wait_for_health_check(node)
+        QA::Runtime::Logger.info("Waiting for health check on #{node}")
+        wait_until_shell_command("docker exec #{node} bash -c 'cat /var/log/gitlab/gitaly/current'") do |line|
+          QA::Runtime::Logger.debug(line.chomp)
+          log = JSON.parse(line)
+
+          log['grpc.request.fullMethod'] == '/grpc.health.v1.Health/Check' && log['grpc.code'] == 'OK'
+        rescue JSON::ParserError
+          # Ignore lines that can't be parsed as JSON
+        end
+      end
+
+      def wait_for_secondary_node_health_check_failure
+        wait_for_health_check_failure(@secondary_node)
+      end
+
+      def wait_for_health_check_failure(node)
+        QA::Runtime::Logger.info("Waiting for Praefect to record a health check failure on #{node}")
+        wait_until_shell_command("docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'") do |line|
+          QA::Runtime::Logger.debug(line.chomp)
+          log = JSON.parse(line)
+
+          log['msg'] == 'error when pinging healthcheck' && log['storage'] == node
+        rescue JSON::ParserError
+          # Ignore lines that can't be parsed as JSON
+        end
+      end
+
       def wait_for_gitaly_check
-        storage_ok = false
-        check_finished = false
-
-        wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line|
-          QA::Runtime::Logger.info(line.chomp)
-
-          storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/
-          check_finished = true if line =~ /Checking Gitaly ... Finished/
-
-          storage_ok && check_finished
+        Support::Waiter.repeat_until(max_attempts: 3) do
+          storage_ok = false
+          check_finished = false
+
+          wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line|
+            QA::Runtime::Logger.debug(line.chomp)
+
+            storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/
+            check_finished = true if line =~ /Checking Gitaly ... Finished/
+
+            storage_ok && check_finished
+          end
         end
       end
 
-      def wait_for_gitlab_shell_check
-        wait_until_shell_command_matches(
-          "docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitlab_shell:check'",
-          /Checking GitLab Shell ... Finished/
-        )
+      # Waits until there is an increase in the number of reads for
+      # any node compared to the number of reads provided. If a node
+      # has no pre-read data, consider it to have had zero reads.
+      def wait_for_read_count_change(pre_read_data)
+        diff_found = false
+        Support::Waiter.wait_until(sleep_interval: 5) do
+          query_read_distribution.each_with_index do |data, index|
+            diff_found = true if data[:value] > value_for_node(pre_read_data, data[:node])
+          end
+          diff_found
+        end
+      end
+
+      def value_for_node(data, node)
+        data.find(-> {{ value: 0 }}) { |item| item[:node] == node }[:value]
       end
 
       def wait_for_reliable_connection
+        QA::Runtime::Logger.info('Wait until GitLab and Praefect can communicate reliably')
         wait_for_praefect
         wait_for_sql_ping
         wait_for_storage_nodes
         wait_for_gitaly_check
-        wait_for_gitlab_shell_check
+      end
+
+      def wait_for_replication(project_id)
+        Support::Waiter.wait_until(sleep_interval: 1) { replication_queue_incomplete_count == 0 && replicated?(project_id) }
       end
 
       private
 
-      def wait_until_shell_command(cmd)
-        Support::Waiter.wait_until do
-          shell cmd do |line|
-            break true if yield line
-          end
-        end
-      end
-
-      def wait_until_shell_command_matches(cmd, regex)
-        wait_until_shell_command(cmd) do |line|
-          QA::Runtime::Logger.info(line.chomp)
-
-          line =~ regex
-        end
-      end
+      def current_primary_node
+        shell dataloss_command do |line|
+          QA::Runtime::Logger.debug(line.chomp)
+
+          match = line.match(/Primary: (.*)/)
+          break match[1] if match
+        end
+      end
+
+      def dataloss_command
+        "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dataloss'"
+      end
+
+      def sql_to_docker_exec_cmd(sql)
+        Service::Shellout.sql_to_docker_exec_cmd(sql, 'postgres', 'SQL_PASSWORD', 'praefect_production', 'postgres.test', @postgres)
+      end
+
+      def verify_storage_move_from_gitaly(storage)
+        wait_until_shell_command("docker exec #{@gitlab} bash -c 'tail -n 50 /var/log/gitlab/gitaly/current'") do |line|
+          log = JSON.parse(line)
+
+          break log['grpc.request.repoPath'] if log['grpc.method'] == 'RenameRepository' && log['grpc.request.repoStorage'] == storage && !log['grpc.request.repoPath'].include?('wiki')
+        rescue JSON::ParserError
+          # Ignore lines that can't be parsed as JSON
+        end
+      end
+
+      def verify_storage_move_to_praefect(repo_path, virtual_storage)
+        wait_until_shell_command("docker exec #{@gitlab} bash -c 'tail -n 50 /var/log/gitlab/praefect/current'") do |line|
+          log = JSON.parse(line)
+
+          log['grpc.method'] == 'ReplicateRepository' && log['virtual_storage'] == virtual_storage && log['relative_path'] == repo_path
+        rescue JSON::ParserError
+          # Ignore lines that can't be parsed as JSON
+        end
+      end
+
+      def verify_storage_move_to_gitaly(repo_path, storage)
+        wait_until_shell_command("docker exec #{@gitlab} bash -c 'tail -n 50 /var/log/gitlab/gitaly/current'") do |line|
+          log = JSON.parse(line)
+
+          log['grpc.method'] == 'ReplicateRepository' && log['grpc.request.repoStorage'] == storage && log['grpc.request.repoPath'] == repo_path
+        rescue JSON::ParserError
+          # Ignore lines that can't be parsed as JSON
+        end
+      end
+
+      def with_praefect_log
+        wait_until_shell_command("docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'") do |line|
+          QA::Runtime::Logger.debug(line.chomp)
+          yield JSON.parse(line)
+        end
+      end
     end
   end
 end
diff --git a/qa/qa/service/shellout.rb b/qa/qa/service/shellout.rb
index 6efe50c4ae2..81cfaa125a9 100644
--- a/qa/qa/service/shellout.rb
+++ b/qa/qa/service/shellout.rb
@@ -33,6 +33,31 @@ module QA
           end
         end
       end
+
+      def sql_to_docker_exec_cmd(sql, username, password, database, host, container)
+        <<~CMD
+          docker exec --env PGPASSWORD=#{password} #{container} \
+          bash -c "psql -U #{username} -d #{database} -h #{host} -c \\"#{sql}\\""
+        CMD
+      end
+
+      def wait_until_shell_command(cmd, **kwargs)
+        sleep_interval = kwargs.delete(:sleep_interval) || 1
+
+        Support::Waiter.wait_until(sleep_interval: sleep_interval, **kwargs) do
+          shell cmd do |line|
+            break true if yield line
+          end
+        end
+      end
+
+      def wait_until_shell_command_matches(cmd, regex, **kwargs)
+        wait_until_shell_command(cmd, kwargs) do |line|
+          QA::Runtime::Logger.debug(line.chomp)
+
+          line =~ regex
+        end
+      end
     end
   end
 end
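A few annotations on the new helpers follow; the sketches are illustrative Ruby, not part of the commit. First, the checksum comparison in `PraefectManager#replicated?`: the shell block breaks out with the raw `gitaly_cluster-…` row, and the checksums in columns 1..3 must all agree. A minimal sketch using the sample row quoted in the diff's own comment:

```ruby
# Sample row from the `gitlab:praefect:replicas` rake task output, as quoted
# in the diff's comment. Columns 1..3 hold the checksums from gitaly1-3.
row = 'gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | ' \
      '23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619'

checksums = row.split('|').map(&:strip).slice(1..3)

# The project counts as replicated when all three nodes report one checksum.
puts checksums.uniq.one? # => true
```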
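The new `Shellout#sql_to_docker_exec_cmd` replaces the hand-built psql heredocs. Copying the helper out of the shellout.rb hunk and calling it with the arguments `PraefectManager#sql_to_docker_exec_cmd` passes shows roughly what gets executed (the printed shape below is inferred from Ruby's heredoc escape rules, not taken from the commit):

```ruby
# The helper as added in qa/qa/service/shellout.rb, extracted for illustration.
def sql_to_docker_exec_cmd(sql, username, password, database, host, container)
  <<~CMD
    docker exec --env PGPASSWORD=#{password} #{container} \
    bash -c "psql -U #{username} -d #{database} -h #{host} -c \\"#{sql}\\""
  CMD
end

puts sql_to_docker_exec_cmd(
  "select count(*) from replication_queue_lock where acquired = 't';",
  'postgres', 'SQL_PASSWORD', 'praefect_production', 'postgres.test', 'postgres'
)
# Prints roughly (as a single shell command line):
#   docker exec --env PGPASSWORD=SQL_PASSWORD postgres bash -c "psql -U postgres
#   -d praefect_production -h postgres.test -c \"select count(*) from ...\""
```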
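`query_read_distribution` expects the standard Prometheus instant-query response shape; here is a sketch with made-up numbers (the real call curls `http://localhost:9090/api/v1/query` inside the `gitlab-gitaly-cluster` container):

```ruby
require 'json'

# Made-up sample of a Prometheus response for gitaly_praefect_read_distribution
# (shape per the Prometheus HTTP API; the values are invented).
output = <<~JSON
  {
    "status": "success",
    "data": {
      "result": [
        { "metric": { "storage": "gitaly1" }, "value": [1597948926.0, "12"] },
        { "metric": { "storage": "gitaly2" }, "value": [1597948926.0, "7"] }
      ]
    }
  }
JSON

result = JSON.parse(output)
raise 'Unable to query read distribution metrics' unless result['status'] == 'success'

# Same transformation as query_read_distribution: one hash per storage node.
p result['data']['result'].map { |r| { node: r['metric']['storage'], value: r['value'][1].to_i } }
# => [{:node=>"gitaly1", :value=>12}, {:node=>"gitaly2", :value=>7}]
```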
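Both count helpers read `result[2]` because psql prints a header row and a separator before the value, as their inline comments show:

```ruby
# Lines collected from psql for "select count(*) ...": header, separator,
# value, then the row-count footer. The count therefore sits at index 2.
result = [' count', '-------', '     1', '(1 row)']
puts result[2].to_i # => 1
```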
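Finally, `value_for_node` leans on the `ifnone` argument of Ruby's `Enumerable#find`: when no entry matches the node, the lambda is called and its `{ value: 0 }` hash stands in for a node with no pre-read data:

```ruby
data = [{ node: 'gitaly1', value: 12 }]

# A matching node returns its recorded value.
p data.find(-> { { value: 0 } }) { |item| item[:node] == 'gitaly1' }[:value] # => 12

# No match: find calls the ifnone lambda, so the zero-reads default is used.
p data.find(-> { { value: 0 } }) { |item| item[:node] == 'gitaly2' }[:value] # => 0
```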