summaryrefslogtreecommitdiff
path: root/qa/qa/service/praefect_manager.rb
diff options
context:
space:
mode:
Diffstat (limited to 'qa/qa/service/praefect_manager.rb')
-rw-r--r--qa/qa/service/praefect_manager.rb157
1 files changed, 78 insertions, 79 deletions
diff --git a/qa/qa/service/praefect_manager.rb b/qa/qa/service/praefect_manager.rb
index dd4cce5d0b0..7e47049d446 100644
--- a/qa/qa/service/praefect_manager.rb
+++ b/qa/qa/service/praefect_manager.rb
@@ -19,7 +19,7 @@ module QA
@virtual_storage = 'default'
end
- attr_reader :primary_node, :secondary_node, :tertiary_node
+ attr_reader :primary_node, :secondary_node, :tertiary_node, :postgres
# Executes the praefect `dataloss` command.
#
@@ -83,19 +83,19 @@ module QA
def start_node(name)
shell "docker start #{name}"
+ end
+
+ def stop_node(name)
+ shell "docker stop #{name}"
wait_until_shell_command_matches(
"docker inspect -f {{.State.Running}} #{name}",
- /true/,
+ /false/,
sleep_interval: 3,
max_duration: 180,
retry_on_exception: true
)
end
- def stop_node(name)
- shell "docker stop #{name}"
- end
-
def clear_replication_queue
QA::Runtime::Logger.info("Clearing the replication queue")
shell sql_to_docker_exec_cmd(
@@ -174,13 +174,13 @@ module QA
end
def start_all_nodes
+ start_node(@postgres)
start_node(@primary_node)
start_node(@secondary_node)
start_node(@tertiary_node)
start_node(@praefect)
wait_for_health_check_all_nodes
- wait_for_reliable_connection
end
def verify_storage_move(source_storage, destination_storage, repo_type: :project)
@@ -192,21 +192,23 @@ module QA
end
def wait_for_praefect
- wait_until_shell_command_matches(
- "docker inspect -f {{.State.Running}} #{@praefect}",
- /true/,
- sleep_interval: 3,
- max_duration: 180,
- retry_on_exception: true
- )
+ QA::Runtime::Logger.info("Waiting for health check on praefect")
+ Support::Waiter.wait_until(max_duration: 120, sleep_interval: 1, raise_on_failure: true) do
+ # praefect runs a grpc server on port 2305, which will return an error 'Connection refused' until such time it is ready
+ wait_until_shell_command("docker exec #{@gitaly_cluster} bash -c 'curl #{@praefect}:2305'") do |line|
+ break if line.include?('curl: (1) Received HTTP/0.9 when not allowed')
- QA::Runtime::Logger.info('Wait until Praefect starts and is listening')
- wait_until_shell_command_matches(
- "docker exec #{@praefect} bash -c 'cat /var/log/gitlab/praefect/current'",
- /listening at tcp address/
- )
+ QA::Runtime::Logger.debug(line.chomp)
+ end
+ end
+ end
- wait_for_gitaly_check
+ def praefect_sql_ping_healthy?
+ cmd = "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml sql-ping'"
+ wait_until_shell_command(cmd) do |line|
+ QA::Runtime::Logger.debug(line.chomp)
+ break line.include?('praefect sql-ping: OK')
+ end
end
def wait_for_sql_ping
@@ -220,32 +222,7 @@ module QA
['error when pinging healthcheck', 'failed checking node health'].include?(msg)
end
- def wait_for_no_praefect_storage_error
- # If a healthcheck error was the last message to be logged, we'll keep seeing that message even if it's no longer a problem
- # That is, there's no message shown in the Praefect logs when the healthcheck succeeds
- # To work around that we perform the gitaly check rake task, wait a few seconds, and then we confirm that no healthcheck errors appear
-
- QA::Runtime::Logger.info("Checking that Praefect does not report healthcheck errors with its gitaly nodes")
-
- Support::Waiter.wait_until(max_duration: 120) do
- wait_for_gitaly_check
-
- sleep 5
-
- shell "docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'" do |line|
- QA::Runtime::Logger.debug(line.chomp)
- log = JSON.parse(line)
-
- break true unless health_check_failure_message?(log['msg'])
- rescue JSON::ParserError
- # Ignore lines that can't be parsed as JSON
- end
- end
- end
-
- def wait_for_storage_nodes
- wait_for_no_praefect_storage_error
-
+ def wait_for_dial_nodes_successful
Support::Waiter.repeat_until(max_attempts: 3, max_duration: 120, sleep_interval: 1) do
nodes_confirmed = {
@primary_node => false,
@@ -253,39 +230,55 @@ module QA
@tertiary_node => false
}
- wait_until_shell_command("docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'") do |line|
- QA::Runtime::Logger.debug(line.chomp)
+ nodes_confirmed.each_key do |node|
+ nodes_confirmed[node] = true if praefect_dial_nodes_status?(node)
+ end
- nodes_confirmed.each_key do |node|
- nodes_confirmed[node] = true if line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/
- end
+ nodes_confirmed.values.all?
+ end
+ end
- nodes_confirmed.values.all?
+ def praefect_dial_nodes_status?(node, expect_healthy = true)
+ cmd = "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes -timeout 1s'"
+ if expect_healthy
+ wait_until_shell_command_matches(cmd, /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/)
+ else
+ wait_until_shell_command(cmd, raise_on_failure: false) do |line|
+ QA::Runtime::Logger.debug(line.chomp)
+ break true if line.include?('the following nodes are not healthy') && line.include?(node)
end
end
end
def wait_for_health_check_all_nodes
- wait_for_health_check(@primary_node)
- wait_for_health_check(@secondary_node)
- wait_for_health_check(@tertiary_node)
+ wait_for_gitaly_health_check(@primary_node)
+ wait_for_gitaly_health_check(@secondary_node)
+ wait_for_gitaly_health_check(@tertiary_node)
end
- def wait_for_health_check(node)
+ def wait_for_gitaly_health_check(node)
QA::Runtime::Logger.info("Waiting for health check on #{node}")
+ Support::Waiter.wait_until(max_duration: 120, sleep_interval: 1, raise_on_failure: true) do
+ # gitaly runs a grpc server on port 8075, which will return an error 'Connection refused' until such time it is ready
+ wait_until_shell_command("docker exec #{@praefect} bash -c 'curl #{node}:8075'") do |line|
+ break if line.include?('curl: (1) Received HTTP/0.9 when not allowed')
+
+ QA::Runtime::Logger.debug(line.chomp)
+ end
+ end
wait_until_node_is_marked_as_healthy_storage(node)
end
def wait_for_primary_node_health_check
- wait_for_health_check(@primary_node)
+ wait_for_gitaly_health_check(@primary_node)
end
def wait_for_secondary_node_health_check
- wait_for_health_check(@secondary_node)
+ wait_for_gitaly_health_check(@secondary_node)
end
def wait_for_tertiary_node_health_check
- wait_for_health_check(@tertiary_node)
+ wait_for_gitaly_health_check(@tertiary_node)
end
def wait_for_health_check_failure(node)
@@ -311,7 +304,6 @@ module QA
shell sql_to_docker_exec_cmd("SELECT count(*) FROM healthy_storages WHERE storage = '#{node}';") do |line|
result << line
end
- QA::Runtime::Logger.debug("result is ---#{result}")
result[2].to_i == 0
end
end
@@ -322,21 +314,10 @@ module QA
shell sql_to_docker_exec_cmd("SELECT count(*) FROM healthy_storages WHERE storage = '#{node}';") do |line|
result << line
end
-
- QA::Runtime::Logger.debug("result is ---#{result}")
result[2].to_i == 1
end
end
- def wait_for_gitaly_check
- Support::Waiter.wait_until(max_duration: 120, sleep_interval: 1, raise_on_failure: true) do
- wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:git:fsck'") do |line|
- QA::Runtime::Logger.debug(line.chomp)
- line.include?('Done')
- end
- end
- end
-
# Waits until there is an increase in the number of reads for
# any node compared to the number of reads provided. If a node
# has no pre-read data, consider it to have had zero reads.
@@ -354,12 +335,6 @@ module QA
data.find(-> {{ value: 0 }}) { |item| item[:node] == node }[:value]
end
- def wait_for_reliable_connection
- QA::Runtime::Logger.info('Wait until GitLab and Praefect can communicate reliably')
- wait_for_sql_ping
- wait_for_storage_nodes
- end
-
def wait_for_replication(project_id)
Support::Waiter.wait_until(sleep_interval: 1) { replication_queue_incomplete_count == 0 && replicated?(project_id) }
end
@@ -414,10 +389,34 @@ module QA
end
def remove_tracked_praefect_repository(relative_path, virtual_storage)
- cmd = "gitlab-ctl praefect remove-repository --repository-relative-path #{relative_path} --virtual-storage-name #{virtual_storage}"
+ cmd = "gitlab-ctl praefect remove-repository --repository-relative-path #{relative_path} --virtual-storage-name #{virtual_storage} --apply"
shell "docker exec #{@praefect} bash -c '#{cmd}'"
end
+ # set_replication_factor assigns or unassigns random storage nodes as necessary to reach the desired replication factor for a repository
+ def set_replication_factor(relative_path, virtual_storage, factor)
+ cmd = "/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml set-replication-factor -repository #{relative_path} -virtual-storage #{virtual_storage} -replication-factor #{factor}"
+ shell "docker exec #{@praefect} bash -c '#{cmd}'"
+ end
+
+ # get_replication_storages retrieves a list of currently assigned storages for a repository
+ def get_replication_storages(relative_path, virtual_storage)
+ storage_repositories = []
+ query = "SELECT storage FROM repository_assignments WHERE relative_path='#{relative_path}' AND virtual_storage='#{virtual_storage}';"
+ shell(sql_to_docker_exec_cmd(query)) { |line| storage_repositories << line.strip }
+ # Returned data from query will be in format
+ # storage
+ # --------
+ # gitaly1
+ # gitaly3
+ # gitaly2
+ # (3 rows)
+ #
+
+ # remove 2 header rows and last 2 rows from query response (including blank line)
+ storage_repositories[2..-3]
+ end
+
def add_repo_to_disk(node, repo_path)
cmd = "GIT_DIR=. git init --initial-branch=main /var/opt/gitlab/git-data/repositories/#{repo_path}"
shell "docker exec --user git #{node} bash -c '#{cmd}'"
@@ -452,7 +451,7 @@ module QA
end
def repository_replicated_to_disk?(node, relative_path)
- Support::Waiter.wait_until(max_duration: 300, sleep_interval: 3, raise_on_failure: false) do
+ Support::Waiter.wait_until(max_duration: 300, sleep_interval: 1, raise_on_failure: false) do
result = []
shell sql_to_docker_exec_cmd("SELECT count(*) FROM storage_repositories where relative_path='#{relative_path}';") do |line|
result << line