Machine learning algorithms in C++
KNN.hpp
Go to the documentation of this file.
1 
7 #ifndef MACHINE_LEARNING_KNN_HPP
8 #define MACHINE_LEARNING_KNN_HPP
9 
10 #include <utility>
11 #include <vector>
12 #include <algorithm>
13 #include <cmath>
14 #include <omp.h>
15 #include <chrono>
16 
17 using namespace std;
18 
22 class KNN {
23  public:
24  enum Distance { HAMMING, EUCLIDEAN };
25  private:
26  vector<vector<double>> data;
27  int yColumn, k;
28 
30 
34  void getKNN(int *chosen_indices, const vector<double> &testie) {
35 
36  // initialize two arrays with indices of the k nearest neighbors in our data
37  // and their distances from our new element
38  double chosen_distances[k];
39 
40  int largestDistanceIndex = 0;
41  double largestCandidateDistance = distance == EUCLIDEAN ? euclidean(testie, data[0]) : hamming(testie, data[0]);;
42  for (int i = 0; i < data.size(); i++) {
43  // get the euclidean euclidean from the new element to the current element in our dataset
44  double current_distance = distance == EUCLIDEAN ? euclidean(testie, data[i]) : hamming(testie, data[i]);
45 
46  // first, fill the array with the first k candidates
47  if (i < k) {
48  chosen_indices[i] = i;
49  chosen_distances[i] = current_distance;
50 
51  if (current_distance >= largestCandidateDistance) {
52  largestDistanceIndex = i;
53  largestCandidateDistance = current_distance;
54  }
55  }
56  // then, look for the furthest neighbor candidate and check if ours is closer
57  else if (current_distance < largestCandidateDistance) {
58 
59  //if it is, replace the furthest one with the current one
60  chosen_indices[largestDistanceIndex] = i;
61  chosen_distances[largestDistanceIndex] = current_distance;
62 
63  // look, in the k candidates, the furthest one
64  largestDistanceIndex = 0;
65  largestCandidateDistance = chosen_distances[0];
66  for (int j = 1; j < k; j++) {
67  if (largestCandidateDistance < chosen_distances[j]) {
68  largestDistanceIndex = j;
69  largestCandidateDistance = chosen_distances[j];
70  }
71  }
72  }
73  }
74  }
75 
76  public:
77 
78  int getK() const {
79  return k;
80  }
81 
82  void setK(int k) {
83  KNN::k = k;
84  }
85 
86  const vector<vector<double>> &getData() const {
87  return data;
88  }
89 
90  int getYColumn() const {
91  return yColumn;
92  }
93 
98  explicit KNN(vector<vector<double>> data, int yColumn, int k = 1, Distance distance = EUCLIDEAN) {
99  this->data = std::move(data);
100 
101  sort(this->data.begin(), this->data.end(), [](const vector<double> &a, const vector<double> &b) {
102  for (int i = 0; i < a.size(); i++) {
103  if (a[i] < b[i])
104  return true;
105  if (a[i] > b[i])
106  return false;
107  }
108  return false;
109  });
110 
111  this->yColumn = yColumn;
112  this->k = k;
113  this->distance = distance;
114  }
115 
120  double euclidean(vector<double> a, vector<double> b) {
121  double d = 0;
122  for (int i = 0; i < a.size(); i++) {
123  // ignore our dependent variable column
124  if (i == yColumn)
125  continue;
126 
127  d += pow(a[i] - b[i], 2);
128  }
129  return sqrt(d);
130  }
131 
136  double hamming(vector<double> a, vector<double> b) {
137  double d = 0;
138  for (int i = 0; i < a.size(); i++) {
139  // ignore our dependent variable column
140  if (i == yColumn)
141  continue;
142 
143  d += a[i] != b[i];
144  }
145  return d;
146  }
147 
151  double regression(const vector<double> &testie) {
152  // get the indices of the k-nearest neighbors
153  int chosen_indices[k];
154  getKNN(chosen_indices, testie);
155 
156  // sum the y column
157  double ySum = 0;
158 
159  #pragma omp parallel for reduction(+:ySum)
160  for (int j = 0; j < k; j++) {
161  ySum += data[chosen_indices[j]][yColumn];
162  }
163 
164  // return its mean value
165  return ySum / k;
166  }
167 
168  double classify(const vector<double> &testie) {
169  // get the indices of the k-nearest neighbors
170  int chosen_indices[k];
171  getKNN(chosen_indices, testie);
172 
173  // initialize vectors that will hold individual classes and their number of appearances in the knn
174  int class_votes[k] = {0};
175  vector<double> classes;
176 
177  // sum the occurrences of each class
178  class_votes[0] = 1;
179  classes.push_back(data[chosen_indices[0]][yColumn]);
180  for (int j = 1; j < k; j++) {
181  double current_class = data[chosen_indices[j]][yColumn];
182  for (int i = 0; i < classes.size(); i++) {
183  if (classes[i] == current_class) {
184  class_votes[i]++;
185  break;
186  }
187  }
188  }
189 
190  // get the class with the most votes
191  double winner = classes[0];
192  int winner_votes = class_votes[0];
193  for (int j = 1; j < k; j++) {
194  if (class_votes[j] > winner_votes) {
195  winner = classes[j];
196  winner_votes = class_votes[j];
197  }
198  }
199 
200  return winner;
201  }
202 
203  vector<double> classify(const vector<vector<double>> &test, bool verbose = false) {
204  vector<double> y;
205  unsigned long totalSize = test.size();
206 
207  using clock = chrono::high_resolution_clock;
208  chrono::time_point<chrono::system_clock> start = clock::now();
209 
210  for (int i = 0; i < test.size(); i++) {
211  if (verbose and i % 100 == 0) {
212  float tempo = ((chrono::duration<float>) (clock::now() - start)).count();
213  float media = tempo / (i + 1);
214  float total = media * totalSize;
215  cout << (total - tempo) / 60 << endl;
216  }
217  y.push_back(classify(test[i]));
218  }
219 
220  return y;
221  }
222 
223  vector<double> regression(const vector<vector<double>> &test, bool verbose = false) {
224  vector<double> y;
225  unsigned long totalSize = test.size();
226 
227  for (int i = 0; i < test.size(); i++) {
228  if (verbose and i % (totalSize / 100) == 0)
229  cout << totalSize / (i + 1);
230  y.push_back(regression(test[i]));
231  }
232 
233  return y;
234  }
235 
237  return distance;
238  }
239 
240  void setDistance(Distance distance) {
241  KNN::distance = distance;
242  }
243 };
244 
245 #endif
double euclidean(vector< double > a, vector< double > b)
Calculates the Euclidean distance between two vectors.
Definition: KNN.hpp:120
k-nearest neighbors algorithm
Definition: KNN.hpp:22
int yColumn
Definition: KNN.hpp:27
double hamming(vector< double > a, vector< double > b)
Calculates the Hamming distance between two vectors.
Definition: KNN.hpp:136
vector< double > regression(const vector< vector< double >> &test, bool verbose=false)
Definition: KNN.hpp:223
int getYColumn() const
Definition: KNN.hpp:90
k-nearest neighbors algorithm, able to do regression and classification
Distance
Definition: KNN.hpp:24
void getKNN(int *chosen_indices, const vector< double > &testie)
Finds the k-nearest neighbors of a data element.
Definition: KNN.hpp:34
vector< double > classify(const vector< vector< double >> &test, bool verbose=false)
Definition: KNN.hpp:203
const vector< vector< double > > & getData() const
Definition: KNN.hpp:86
void setDistance(Distance distance)
Definition: KNN.hpp:240
double regression(const vector< double > &testie)
Perform regression using.
Definition: KNN.hpp:151
void setK(int k)
Definition: KNN.hpp:82
double classify(const vector< double > &testie)
Definition: KNN.hpp:168
int getK() const
Definition: KNN.hpp:78
KNN(vector< vector< double >> data, int yColumn, int k=1, Distance distance=EUCLIDEAN)
k-nearest neighbors algorithm, able to do regression and classification
Definition: KNN.hpp:98
Distance distance
Definition: KNN.hpp:29
vector< vector< double > > data
Definition: KNN.hpp:26
Distance getDistance() const
Definition: KNN.hpp:236
int k
Definition: KNN.hpp:27