diff options
author | Oswaldo Ferreira <oswaldo@gitlab.com> | 2019-09-10 20:00:52 -0300 |
---|---|---|
committer | Oswaldo Ferreira <oswaldo@gitlab.com> | 2019-09-12 18:50:35 -0300 |
commit | 6834de923f4811a6aea0993baca2ed6f11c9e000 (patch) | |
tree | faaaaea14e59d4ce2da94b07f9fc92bf6a6c0c82 | |
parent | e8bb3c1ab7d6ddcaf7db9eeb5d701f97dff52cdc (diff) | |
download | gitlab-ce-28149-improve-seed.tar.gz |
-rw-r--r-- | Gemfile | 2 | ||||
-rw-r--r-- | Gemfile.lock | 2 | ||||
-rw-r--r-- | app/models/project.rb | 1 | ||||
-rw-r--r-- | db/fixtures/development/02_users.rb | 36 | ||||
-rw-r--r-- | db/fixtures/development/03_project.rb | 303 | ||||
-rw-r--r-- | db/fixtures/development/04_labels.rb | 53 | ||||
-rw-r--r-- | db/fixtures/development/04_projects.rb | 131 | ||||
-rw-r--r-- | lib/gitlab/seeder.rb | 29 |
8 files changed, 273 insertions, 284 deletions
@@ -322,7 +322,7 @@ group :development do gem 'letter_opener_web', '~> 1.3.4' gem 'rblineprof', '~> 0.3.6', platform: :mri, require: false - gem 'active_record-pg_generate_series', '~> 0.1.2' + gem 'active_record-pg_generate_series', '~> 0.1.3' # Better errors handler gem 'better_errors', '~> 2.5.0' diff --git a/Gemfile.lock b/Gemfile.lock index 3be7329b717..868824e6406 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1053,7 +1053,7 @@ DEPENDENCIES RedCloth (~> 4.3.2) ace-rails-ap (~> 4.1.0) acme-client (~> 2.0.2) - active_record-pg_generate_series (~> 0.1.2) + active_record-pg_generate_series (~> 0.1.3) activerecord-explain-analyze (~> 0.1) acts-as-taggable-on (~> 6.0) addressable (~> 2.5.2) diff --git a/app/models/project.rb b/app/models/project.rb index d948410e397..b6351e05792 100644 --- a/app/models/project.rb +++ b/app/models/project.rb @@ -31,6 +31,7 @@ class Project < ApplicationRecord include FeatureGate include OptionallySearch include FromUnion + include EachBatch extend Gitlab::Cache::RequestCache extend Gitlab::ConfigHelper diff --git a/db/fixtures/development/02_users.rb b/db/fixtures/development/02_users.rb index d5629459284..685b8aa385b 100644 --- a/db/fixtures/development/02_users.rb +++ b/db/fixtures/development/02_users.rb @@ -2,7 +2,7 @@ class Gitlab::Seeder::Users include ActionView::Helpers::NumberHelper RANDOM_USERS_COUNT = 20 - MASS_USERS_COUNT = 1_500_000 + MASS_USERS_COUNT = 1_000_000 attr_reader :opts @@ -40,14 +40,34 @@ class Gitlab::Seeder::Users def create_mass_users! encrypted_password = Devise::Encryptor.digest(User, '12345678') - User.insert_using_generate_series(1, MASS_USERS_COUNT, debug: true) do |sql| - sql.username = raw("'user' || seq") - sql.name = raw("'User ' || seq") - sql.email = raw("'user' || seq || '@example.com'") - sql.confirmed_at = raw("('1388530801'::timestamp + seq)::date") # 2014-01-01 - sql.encrypted_password = encrypted_password + Gitlab::Seeder.with_mass_insert(MASS_USERS_COUNT, User) do + User.insert_using_generate_series(1, MASS_USERS_COUNT) do |sql| + sql.username = raw("'seed_user' || seq") + sql.name = raw("'Seed user ' || seq") + sql.email = raw("'seed_user' || seq || '@example.com'") + sql.confirmed_at = raw("('2019-09-10'::date + seq)") + sql.projects_limit = 10_000_000 # no limit + sql.encrypted_password = encrypted_password + end + end + + # We can't use a sub-query here given we want to insert it just for the new + # namespaces. + Gitlab::Seeder.with_mass_insert(MASS_USERS_COUNT, Namespace, :batch) do + existing_namespaces = Namespace.pluck(:id) + User.where.not(id: existing_namespaces).find_in_batches(batch_size: 1_000) do |users| + rows = users.map do |user| + { + name: user.username, + path: user.username, + owner_id: user.id + } + end + + Gitlab::Database.bulk_insert('namespaces', rows) + print '.' + end end - puts "\n#{number_with_delimiter(MASS_USERS_COUNT)} users created!" end end diff --git a/db/fixtures/development/03_project.rb b/db/fixtures/development/03_project.rb index 46018cf68aa..9bc7559e500 100644 --- a/db/fixtures/development/03_project.rb +++ b/db/fixtures/development/03_project.rb @@ -1,137 +1,216 @@ require './spec/support/sidekiq' -# rubocop:disable Rails/Output - -Sidekiq::Testing.inline! do - Gitlab::Seeder.quiet do - Gitlab::Seeder.without_gitaly_timeout do - project_urls = %w[ - https://gitlab.com/gitlab-org/gitlab-test.git - https://gitlab.com/gitlab-org/gitlab-shell.git - https://gitlab.com/gnuwget/wget2.git - https://gitlab.com/Commit451/LabCoat.git - https://github.com/jashkenas/underscore.git - https://github.com/flightjs/flight.git - https://github.com/twitter/typeahead.js.git - https://github.com/h5bp/html5-boilerplate.git - https://github.com/google/material-design-lite.git - https://github.com/jlevy/the-art-of-command-line.git - https://github.com/FreeCodeCamp/freecodecamp.git - https://github.com/google/deepdream.git - https://github.com/jtleek/datasharing.git - https://github.com/WebAssembly/design.git - https://github.com/airbnb/javascript.git - https://github.com/tessalt/echo-chamber-js.git - https://github.com/atom/atom.git - https://github.com/mattermost/mattermost-server.git - https://github.com/purifycss/purifycss.git - https://github.com/facebook/nuclide.git - https://github.com/wbkd/awesome-d3.git - https://github.com/kilimchoi/engineering-blogs.git - https://github.com/gilbarbara/logos.git - https://github.com/reduxjs/redux.git - https://github.com/awslabs/s2n.git - https://github.com/arkency/reactjs_koans.git - https://github.com/twbs/bootstrap.git - https://github.com/chjj/ttystudio.git - https://github.com/MostlyAdequate/mostly-adequate-guide.git - https://github.com/octocat/Spoon-Knife.git - https://github.com/opencontainers/runc.git - https://github.com/googlesamples/android-topeka.git - ] - - large_project_urls = %w[ - https://github.com/torvalds/linux.git - https://gitlab.gnome.org/GNOME/gimp.git - https://gitlab.gnome.org/GNOME/gnome-mud.git - https://gitlab.com/fdroid/fdroidclient.git - https://gitlab.com/inkscape/inkscape.git - https://github.com/gnachman/iTerm2.git - ] - - def create_project(url, force_latest_storage: false) - group_path, project_path = url.split('/')[-2..-1] - - group = Group.find_by(path: group_path) - - unless group - group = Group.new( - name: group_path.titleize, - path: group_path - ) - group.description = FFaker::Lorem.sentence - group.save! - - group.add_owner(User.first) - end +class Gitlab::Seeder::Projects + include ActionView::Helpers::NumberHelper + + PROJECT_URLS = %w[ + https://gitlab.com/gitlab-org/gitlab-test.git + https://gitlab.com/gitlab-org/gitlab-shell.git + https://gitlab.com/gnuwget/wget2.git + https://gitlab.com/Commit451/LabCoat.git + https://github.com/jashkenas/underscore.git + https://github.com/flightjs/flight.git + https://github.com/twitter/typeahead.js.git + https://github.com/h5bp/html5-boilerplate.git + https://github.com/google/material-design-lite.git + https://github.com/jlevy/the-art-of-command-line.git + https://github.com/FreeCodeCamp/freecodecamp.git + https://github.com/google/deepdream.git + https://github.com/jtleek/datasharing.git + https://github.com/WebAssembly/design.git + https://github.com/airbnb/javascript.git + https://github.com/tessalt/echo-chamber-js.git + https://github.com/atom/atom.git + https://github.com/mattermost/mattermost-server.git + https://github.com/purifycss/purifycss.git + https://github.com/facebook/nuclide.git + https://github.com/wbkd/awesome-d3.git + https://github.com/kilimchoi/engineering-blogs.git + https://github.com/gilbarbara/logos.git + https://github.com/reduxjs/redux.git + https://github.com/awslabs/s2n.git + https://github.com/arkency/reactjs_koans.git + https://github.com/twbs/bootstrap.git + https://github.com/chjj/ttystudio.git + https://github.com/MostlyAdequate/mostly-adequate-guide.git + https://github.com/octocat/Spoon-Knife.git + https://github.com/opencontainers/runc.git + https://github.com/googlesamples/android-topeka.git + ] + LARGE_PROJECT_URLS = %w[ + https://github.com/torvalds/linux.git + https://gitlab.gnome.org/GNOME/gimp.git + https://gitlab.gnome.org/GNOME/gnome-mud.git + https://gitlab.com/fdroid/fdroidclient.git + https://gitlab.com/inkscape/inkscape.git + https://github.com/gnachman/iTerm2.git + ] + + # Consider altering MASS_USERS_COUNT for less + # users with projects. + MASS_PROJECTS_COUNT_PER_USER = { + private: 3, # 3m projects + + internal: 1, # 1m projects + + public: 1 # 1m projects = 5m total + } + + def seed! + Sidekiq::Testing.inline! do + create_real_projects! + create_mass_projects! + end + end - project_path.gsub!(".git", "") + private - params = { - import_url: url, - namespace_id: group.id, - name: project_path.titleize, - description: FFaker::Lorem.sentence, - visibility_level: Gitlab::VisibilityLevel.values.sample, - skip_disk_validation: true - } + def create_real_projects! + # You can specify how many projects you need during seed execution + size = ENV['SIZE'].present? ? ENV['SIZE'].to_i : 8 - if force_latest_storage - params[:storage_version] = Project::LATEST_STORAGE_VERSION - end + PROJECT_URLS.first(size).each_with_index do |url, i| + create_real_project!(url, force_latest_storage: i.even?) + end - project = nil + if ENV['LARGE_PROJECTS'].present? + LARGE_PROJECT_URLS.each(&method(:create_real_project!)) - Sidekiq::Worker.skipping_transaction_check do - project = Projects::CreateService.new(User.first, params).execute + if ENV['FORK'].present? + puts "\nGenerating forks" - # Seed-Fu runs this entire fixture in a transaction, so the `after_commit` - # hook won't run until after the fixture is loaded. That is too late - # since the Sidekiq::Testing block has already exited. Force clearing - # the `after_commit` queue to ensure the job is run now. - project.send(:_run_after_commit_queue) - project.import_state.send(:_run_after_commit_queue) - end + project_name = ENV['FORK'] == 'true' ? 'torvalds/linux' : ENV['FORK'] + + project = Project.find_by_full_path(project_name) - if project.valid? && project.valid_repo? - print '.' - else - puts project.errors.full_messages - print 'F' + User.offset(1).first(5).each do |user| + new_project = ::Projects::ForkService.new(project, user).execute + + if new_project.valid? && (new_project.valid_repo? || new_project.import_state.scheduled?) + print '.' + else + new_project.errors.full_messages.each do |error| + puts "#{new_project.full_path}: #{error}" + end + print 'F' + end end end + end + end - # You can specify how many projects you need during seed execution - size = ENV['SIZE'].present? ? ENV['SIZE'].to_i : 8 + def create_real_project!(url, force_latest_storage: false) + group_path, project_path = url.split('/')[-2..-1] - project_urls.first(size).each_with_index do |url, i| - create_project(url, force_latest_storage: i.even?) - end + group = Group.find_by(path: group_path) - if ENV['LARGE_PROJECTS'].present? - large_project_urls.each(&method(:create_project)) + unless group + group = Group.new( + name: group_path.titleize, + path: group_path + ) + group.description = FFaker::Lorem.sentence + group.save! - if ENV['FORK'].present? - puts "\nGenerating forks" + group.add_owner(User.first) + end + + project_path.gsub!(".git", "") - project_name = ENV['FORK'] == 'true' ? 'torvalds/linux' : ENV['FORK'] + params = { + import_url: url, + namespace_id: group.id, + name: project_path.titleize, + description: FFaker::Lorem.sentence, + visibility_level: Gitlab::VisibilityLevel.values.sample, + skip_disk_validation: true + } - project = Project.find_by_full_path(project_name) + if force_latest_storage + params[:storage_version] = Project::LATEST_STORAGE_VERSION + end - User.offset(1).first(5).each do |user| - new_project = Projects::ForkService.new(project, user).execute + project = nil - if new_project.valid? && (new_project.valid_repo? || new_project.import_state.scheduled?) - print '.' - else - new_project.errors.full_messages.each do |error| - puts "#{new_project.full_path}: #{error}" - end - print 'F' - end + Sidekiq::Worker.skipping_transaction_check do + project = ::Projects::CreateService.new(User.first, params).execute + + # Seed-Fu runs this entire fixture in a transaction, so the `after_commit` + # hook won't run until after the fixture is loaded. That is too late + # since the Sidekiq::Testing block has already exited. Force clearing + # the `after_commit` queue to ensure the job is run now. + project.send(:_run_after_commit_queue) + project.import_state&.send(:_run_after_commit_queue) + end + + if project.valid? && project.valid_repo? + print '.' + else + puts project.errors.full_messages + print 'F' + end + end + + def create_mass_projects_by_visility!(visibility) + # Let's leave the admin outside the mass insertion + relation = User.where.not(id: User.first).includes(:namespace) + row_count = MASS_PROJECTS_COUNT_PER_USER.fetch(visibility) + visibility_level = Gitlab::VisibilityLevel.level_value(visibility.to_s) + timestamp = Time.now + + Gitlab::Seeder.with_mass_insert(relation.count * row_count, Project, :batch) do + relation.find_in_batches(batch_size: 1_000) do |users| + # This will build an array with 3_000 rows at most for + # each iteration. + rows = users.flat_map do |user| + row_count.times.map do |t| + { + name: "Seed project #{t} (#{visibility})", + path: "seed_project_#{visibility}_#{t}", + creator_id: user.id, + namespace_id: user.namespace_id, + visibility_level: visibility_level, + created_at: timestamp, + updated_at: timestamp + } end end + + Gitlab::Database.bulk_insert('projects', rows) + print '.' end end end + + def create_mass_projects! + create_mass_projects_by_visility!(:private) + create_mass_projects_by_visility!(:internal) + create_mass_projects_by_visility!(:public) + create_mass_routes_for_projects! + end + + # TODO: Improve performance of this + def create_mass_routes_for_projects! + # All real projects are imports and already have route relations. + relation = Project.where(import_url: nil)#.includes(:namespace) + + Gitlab::Seeder.with_mass_insert(relation.count, Route, :batch) do + relation.each_batch do |projects| + rows = projects.includes(:namespace).map do |project| + { + source_id: project.id, + source_type: 'Project', + name: project.full_name, + path: project.full_path + } + end + + Gitlab::Database.bulk_insert('routes', rows) + print '.' + end + end + end +end + +Gitlab::Seeder.quiet do + projects = Gitlab::Seeder::Projects.new + projects.seed! end diff --git a/db/fixtures/development/04_labels.rb b/db/fixtures/development/04_labels.rb index f7d87b2b318..4d352858d50 100644 --- a/db/fixtures/development/04_labels.rb +++ b/db/fixtures/development/04_labels.rb @@ -10,47 +10,38 @@ class Gitlab::Seeder::GroupLabels end class Gitlab::Seeder::ProjectLabels - MASS_LABELS_COUNT = 400 # per project + include ActionView::Helpers::NumberHelper - def initialize(project) - @project = project - end + PROJECT_LIMIT = 500_000 + MASS_LABELS_COUNT = 50 # per project def seed! - Project.select(:id).find_in_batches(batch_size: 100) do |projects| - rows = projects.flat_map do |project| - MASS_LABELS_COUNT.times.map do - label_title = FFaker::Product.brand - - { - title: label_title, - color: "##{Digest::MD5.hexdigest(label_title)[0..5]}", - project_id: project.id, - type: 'ProjectLabel' - } + relation = Project.select(:id).limit(PROJECT_LIMIT) + total_labels = relation.count * MASS_LABELS_COUNT + + Gitlab::Seeder.with_mass_insert(total_labels, Label) do + relation.find_in_batches(batch_size: 500) do |projects| + rows = projects.flat_map do |project| + MASS_LABELS_COUNT.times.map do + label_title = FFaker::Product.brand + + { + title: label_title, + color: "##{Digest::MD5.hexdigest(label_title)[0..5]}", + project_id: project.id, + type: 'ProjectLabel' + } + end end - end - - puts "*** ROWS BEING INSERTED ***" - p rows.size - - print rows.size * '.' - Gitlab::Database.bulk_insert('labels', rows) - puts "*** LABELS COUNT ***" - print Label.count + Gitlab::Database.bulk_insert('labels', rows) + print '.' + end end - - puts "#{number_with_delimiter(MASS_LABELS_COUNT)} project labels created for each project!" end end Gitlab::Seeder.quiet do - # puts "\nGenerating group labels" - # group_labels = Gitlab::Seeder::GroupLabels.new - # group_labels.seed! - - puts "\nGenerating project labels" project_labels = Gitlab::Seeder::ProjectLabels.new project_labels.seed! end diff --git a/db/fixtures/development/04_projects.rb b/db/fixtures/development/04_projects.rb deleted file mode 100644 index 9f551ee7033..00000000000 --- a/db/fixtures/development/04_projects.rb +++ /dev/null @@ -1,131 +0,0 @@ -require './spec/support/sidekiq' - -class Gitlab::Seeder::Projects - include ActionView::Helpers::NumberHelper - - PROJECT_URLS = [ - 'https://gitlab.com/gitlab-org/gitlab-test.git', - 'https://gitlab.com/gitlab-org/gitlab-ce.git', - 'https://gitlab.com/gitlab-org/gitlab-ci.git', - 'https://gitlab.com/gitlab-org/gitlab-shell.git', - 'https://github.com/documentcloud/underscore.git', - 'https://github.com/twitter/flight.git', - 'https://github.com/twitter/typeahead.js.git', - 'https://github.com/h5bp/html5-boilerplate.git', - 'https://github.com/google/material-design-lite.git', - 'https://github.com/jlevy/the-art-of-command-line.git', - 'https://github.com/FreeCodeCamp/freecodecamp.git', - 'https://github.com/google/deepdream.git', - 'https://github.com/jtleek/datasharing.git', - 'https://github.com/WebAssembly/design.git', - 'https://github.com/airbnb/javascript.git', - 'https://github.com/tessalt/echo-chamber-js.git', - 'https://github.com/atom/atom.git', - 'https://github.com/mattermost/platform.git', - 'https://github.com/purifycss/purifycss.git', - 'https://github.com/facebook/nuclide.git', - 'https://github.com/wbkd/awesome-d3.git', - 'https://github.com/kilimchoi/engineering-blogs.git', - 'https://github.com/gilbarbara/logos.git', - 'https://github.com/gaearon/redux.git', - 'https://github.com/awslabs/s2n.git', - 'https://github.com/arkency/reactjs_koans.git', - 'https://github.com/twbs/bootstrap.git', - 'https://github.com/chjj/ttystudio.git', - 'https://github.com/DrBoolean/mostly-adequate-guide.git', - 'https://github.com/octocat/Spoon-Knife.git', - 'https://github.com/opencontainers/runc.git', - 'https://github.com/googlesamples/android-topeka.git' - ] - MASS_PROJECTS_COUNT = { - private: 2_000_000, - internal: 30_000, - public: 265_000 - } - - attr_reader :opts - - def initialize(opts = {}) - @opts = opts - end - - def seed! - Sidekiq::Testing.inline! do - create_real_projects!(opts[:count]) - create_mass_projects! - end - end - - private - - def create_real_projects!(count) - PROJECT_URLS.first(count).each_with_index do |url, i| - group_path, project_path = url.split('/')[-2..-1] - - group = Group.find_by(path: group_path) - - unless group - group = Group.new( - name: group_path.titleize, - path: group_path - ) - group.description = FFaker::Lorem.sentence - group.save - - group.add_owner(User.first) - end - - project_path.gsub!(".git", "") - - params = { - import_url: url, - namespace_id: group.id, - name: project_path.titleize, - description: FFaker::Lorem.sentence, - visibility_level: Gitlab::VisibilityLevel.values.sample - } - - project = ::Projects::CreateService.new(User.first, params).execute - # Seed-Fu runs this entire fixture in a transaction, so the `after_commit` - # hook won't run until after the fixture is loaded. That is too late - # since the Sidekiq::Testing block has already exited. Force clearing - # the `after_commit` queue to ensure the job is run now. - project.send(:_run_after_commit_queue) - - if project.valid? && project.valid_repo? - print '.' - else - puts project.errors.full_messages - print 'F' - end - end - end - - def create_mass_projects! - create_mass_projects_by_visility!(:private) - create_mass_projects_by_visility!(:internal) - create_mass_projects_by_visility!(:public) - end - - def create_mass_projects_by_visility!(visibility) - users = User.limit(100) - groups = Group.limit(100) - namespaces = users + groups - Project.insert_using_generate_series(1, MASS_PROJECTS_COUNT[visibility], debug: true) do |sql| - project_name = raw("'seed_#{visibility}_project_' || seq") - namespace = namespaces.take - sql.name = project_name - sql.path = project_name - sql.creator_id = namespace.is_a?(Group) ? namespace.owner_id : users.take.id - sql.namespace_id = namespace.is_a?(Group) ? namespace.id : namespace.namespace_id - sql.visibility_level = Gitlab::VisibilityLevel.level_value(visibility.to_s) - end - puts "#{number_with_delimiter(MASS_PROJECTS_COUNT[visibility])} projects created!" - end -end - -Gitlab::Seeder.quiet do - count = ENV['SIZE'].present? ? ENV['SIZE'].to_i : 8 - projects = Gitlab::Seeder::Projects.new(count: count) - projects.seed! -end diff --git a/lib/gitlab/seeder.rb b/lib/gitlab/seeder.rb index 15c6caba63d..c6f2b4934e3 100644 --- a/lib/gitlab/seeder.rb +++ b/lib/gitlab/seeder.rb @@ -14,6 +14,35 @@ end module Gitlab class Seeder + extend ActionView::Helpers::NumberHelper + + ESTIMATED_INSERT_PER_MINUTE = 2_000_000 + + def self.with_mass_insert(size, model, strategy = :series) + humanized_size = number_with_delimiter(size) + humanized_model_name = model.model_name.human.pluralize(size) + estimative = humanized_insert_time_message(size, strategy) + + puts "\nCreating #{humanized_size} #{humanized_model_name} (#{strategy} strategy)." + puts estimative + + yield + + puts "\n#{number_with_delimiter(size)} #{humanized_model_name} created!" + end + + def self.humanized_insert_time_message(size, strategy) + estimated_minutes = (size.to_f / ESTIMATED_INSERT_PER_MINUTE).round + estimated_minutes = estimated_minutes * 3 if strategy == :batch + humanized_minutes = 'minute'.pluralize(estimated_minutes) + + if estimated_minutes.zero? + "Estimated time: less than a minute ⏰" + else + "Estimated time: #{estimated_minutes} #{humanized_minutes} ⏰" + end + end + def self.quiet # Disable database insertion logs so speed isn't limited by ability to print to console old_logger = ActiveRecord::Base.logger |