summaryrefslogtreecommitdiff
path: root/chromium/components/federated_learning/sim_hash.cc
blob: 8012adfbbe264e436e5936039bcff8afde357ad7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/federated_learning/sim_hash.h"

#include "base/hash/legacy_hash.h"

#include <algorithm>
#include <cmath>

namespace federated_learning {

namespace {

uint64_t g_seed1 = 1ULL;
uint64_t g_seed2 = 2ULL;
constexpr double kTwoPi = 2.0 * 3.141592653589793;

// Hashes i and j to create a uniform RV in [0,1].
double RandomUniform(uint64_t i, uint64_t j, uint64_t seed) {
  uint64_t arr[2] = {i, j};
  uint64_t hashed = base::legacy::CityHash64WithSeed(
      base::as_bytes(
          base::make_span(reinterpret_cast<const char*>(arr), sizeof(arr))),
      seed);

  return static_cast<double>(hashed) /
         static_cast<double>(std::numeric_limits<uint64_t>::max());
}

// Hashes i and j to create two uniform RVs in [0,1]. Uses the Box-Muller
// transform to generate a Gaussian.
double RandomGaussian(uint64_t i, uint64_t j) {
  double rv1 = RandomUniform(i, j, g_seed1);
  double rv2 = RandomUniform(j, i, g_seed2);

  // BoxMuller
  return std::sqrt(-2.0 * std::log(rv1)) * std::cos(kTwoPi * rv2);
}

}  // namespace

LargeBitVector::LargeBitVector() = default;
LargeBitVector::LargeBitVector(const LargeBitVector&) = default;
LargeBitVector::~LargeBitVector() = default;

void LargeBitVector::SetBit(uint64_t pos) {
  positions_of_set_bits_.insert(pos);
}

const std::set<uint64_t>& LargeBitVector::PositionsOfSetBits() const {
  return positions_of_set_bits_;
}

void SetSeedsForTesting(uint64_t seed1, uint64_t seed2) {
  g_seed1 = seed1;
  g_seed2 = seed2;
}

uint64_t SimHashBits(const LargeBitVector& input, size_t output_dimensions) {
  DCHECK_LT(0u, output_dimensions);
  DCHECK_LE(output_dimensions, 64u);

  uint64_t result = 0;
  for (size_t d = 0; d < output_dimensions; ++d) {
    double acc = 0;

    for (uint64_t pos : input.PositionsOfSetBits())
      acc += RandomGaussian(d, pos);

    if (acc > 0)
      result |= (1ULL << d);
  }

  return result;
}

uint64_t SimHashStrings(const std::unordered_set<std::string>& input,
                        size_t output_dimensions) {
  DCHECK_LT(0u, output_dimensions);
  DCHECK_LE(output_dimensions, 64u);

  LargeBitVector large_bit_vector;
  for (const std::string& s : input) {
    large_bit_vector.SetBit(
        base::legacy::CityHash64(base::as_bytes(base::make_span(s))));
  }

  return SimHashBits(large_bit_vector, output_dimensions);
}

}  // namespace federated_learning