Machine learning algorithms in C++
NaiveBayes.hpp
Go to the documentation of this file.
1 
7 #ifndef MACHINE_LEARNING_NAIVEBAYES_HPP
8 #define MACHINE_LEARNING_NAIVEBAYES_HPP
9 
10 #include <string>
11 #include <map>
12 #include "../include/matrix/Matrix.hpp"
13 
14 using namespace std;
15 
19 class NaiveBayes {
20  private:
21  MatrixI lookupTable, yFrequency;
22  vector<string> lookupColumns, lookupRows;
23  public:
24 
31  explicit NaiveBayes(const string &csvPath, bool verbose = true) {
32  lookupColumns = vector<string>(), lookupRows = vector<string>();
33 
34  vector<vector<string>> data = CSVReader::csvToStringVecVec(csvPath, true);
35  vector<string> csvHeader = data[0];
36  data.erase(data.begin());
37 
38  for (vector<string> csvRow:data) {
39  for (int i = 0; i < csvRow.size() - 1; i++)
40  lookupRows.push_back(csvHeader[i] + '+' + csvRow[i]);
41 
42  lookupColumns.push_back(csvHeader[csvHeader.size() - 1] + '+'
43  + csvRow[csvHeader.size() - 1]);
44  }
45 
46  sort(lookupRows.begin(), lookupRows.end());
47  sort(lookupColumns.begin(), lookupColumns.end());
48  lookupRows.erase(unique(lookupRows.begin(), lookupRows.end()), lookupRows.end());
49  lookupColumns.erase(unique(lookupColumns.begin(), lookupColumns.end()), lookupColumns.end());
50 
51  lookupTable = MatrixI::zeros(lookupRows.size(),
52  lookupColumns.size());
53  yFrequency = MatrixI::zeros(lookupColumns.size(), 1);
54 
55  for (vector<string> csvRow:data) {
56  for (int i = 0; i < csvRow.size() - 1; i++) {
57  string rowElement = csvHeader[i] + '+' + csvRow[i],
58  relevantHeader = csvHeader[csvHeader.size() - 1] + '+' + csvRow[csvHeader.size() - 1];
59  size_t row = static_cast<size_t>(distance(lookupRows.begin(),
60  find(lookupRows.begin(), lookupRows.end(), rowElement)));
61  size_t col = static_cast<size_t>(distance(lookupColumns.begin(),
62  find(lookupColumns.begin(), lookupColumns.end(), relevantHeader)));
63 
64  lookupTable(row, col) += 1;
65  yFrequency(col, 0) += 1;
66  }
67  }
68 
69  if (verbose) {
70  cout << "Lookup table:" << endl << lookupTable << endl << "Rows:" << endl;
71  for (auto s : lookupRows)
72  cout << s << '\t';
73  cout << endl << "Columns:" << endl;
74 
75  for (auto s : lookupColumns)
76  cout << s << '\t';
77  cout << endl;
78 
79  cout << "Class frequency:" << endl << yFrequency;
80  }
81  }
82 
90  vector<string> predict(vector<vector<string>> data, bool verbose = true) {
91  vector<string> csvHeader = data[0];
92  data.erase(data.begin());
93  vector<string> result(data.size());
94  MatrixD probabilities = MatrixD::ones(data.size(), lookupColumns.size());
95 
96  // for each line in our test dataset...
97  #pragma omp parallel for if(data.size() > 500)
98  for (size_t i = 0; i < data.size(); i++) {
99  vector<string> csvRow = data[i];
100 
101  // for each feature in the current row...
102  for (size_t j = 0; j < csvRow.size(); j++) {
103  string rowElement = csvHeader[j] + '+' + csvRow[j];
104  size_t row = static_cast<size_t>(distance(lookupRows.begin(),
105  find(lookupRows.begin(), lookupRows.end(), rowElement)));
106 
107  // for each possible outcome...
108  for (size_t col = 0; col < lookupColumns.size(); col++) {
109  int lookup = lookupTable(row, col), yFreq = yFrequency(col, 0);
110  double currentFrequency = (double) lookupTable(row, col) / yFrequency(col, 0);
111  probabilities(i, col) *= currentFrequency;
112  }
113  }
114 
115  int maxProbIndex = -1;
116  double currentMaxProb = 0, probSum = 0;
117  for (size_t j = 0; j < lookupColumns.size(); j++) {
118  probSum += probabilities(i, j);
119  if (probabilities(i, j) > currentMaxProb) {
120  currentMaxProb = probabilities(i, j);
121  maxProbIndex = static_cast<int>(j);
122  }
123  }
124 
125  // normalize probabilities so their sum equals 1
126  for (size_t j = 0; j < lookupColumns.size(); j++)
127  probabilities(i, j) /= probSum;
128 
129  result[i] = maxProbIndex != -1 ? lookupColumns[maxProbIndex] : "NaN";
130  }
131 
132  if (verbose)
133  cout << "Probabilities:" << endl << probabilities;
134 
135  return result;
136  }
137 };
138 
139 #endif //MACHINE_LEARNING_NAIVEBAYES_HPP
vector< string > lookupRows
Definition: NaiveBayes.hpp:22
NaiveBayes(const string &csvPath, bool verbose=true)
Naive Bayes classifier.
Definition: NaiveBayes.hpp:31
k-nearest neighbors algorithm, able to do regression and classification
MatrixI yFrequency
Definition: NaiveBayes.hpp:21
Naive Bayes classifier.
Definition: NaiveBayes.hpp:19
vector< string > predict(vector< vector< string >> data, bool verbose=true)
Predict the classes of new data.
Definition: NaiveBayes.hpp:90