7 #ifndef MACHINE_LEARNING_NAIVEBAYES_HPP 8 #define MACHINE_LEARNING_NAIVEBAYES_HPP 12 #include "../include/matrix/Matrix.hpp" 31 explicit NaiveBayes(
const string &csvPath,
bool verbose =
true) {
32 lookupColumns = vector<string>(), lookupRows = vector<string>();
34 vector<vector<string>> data = CSVReader::csvToStringVecVec(csvPath,
true);
35 vector<string> csvHeader = data[0];
36 data.erase(data.begin());
38 for (vector<string> csvRow:data) {
39 for (
int i = 0; i < csvRow.size() - 1; i++)
40 lookupRows.push_back(csvHeader[i] +
'+' + csvRow[i]);
42 lookupColumns.push_back(csvHeader[csvHeader.size() - 1] +
'+' 43 + csvRow[csvHeader.size() - 1]);
46 sort(lookupRows.begin(), lookupRows.end());
47 sort(lookupColumns.begin(), lookupColumns.end());
48 lookupRows.erase(unique(lookupRows.begin(), lookupRows.end()), lookupRows.end());
49 lookupColumns.erase(unique(lookupColumns.begin(), lookupColumns.end()), lookupColumns.end());
51 lookupTable = MatrixI::zeros(lookupRows.size(),
52 lookupColumns.size());
53 yFrequency = MatrixI::zeros(lookupColumns.size(), 1);
55 for (vector<string> csvRow:data) {
56 for (
int i = 0; i < csvRow.size() - 1; i++) {
57 string rowElement = csvHeader[i] +
'+' + csvRow[i],
58 relevantHeader = csvHeader[csvHeader.size() - 1] +
'+' + csvRow[csvHeader.size() - 1];
59 size_t row =
static_cast<size_t>(distance(lookupRows.begin(),
60 find(lookupRows.begin(), lookupRows.end(), rowElement)));
61 size_t col =
static_cast<size_t>(distance(lookupColumns.begin(),
62 find(lookupColumns.begin(), lookupColumns.end(), relevantHeader)));
64 lookupTable(row, col) += 1;
65 yFrequency(col, 0) += 1;
70 cout <<
"Lookup table:" << endl << lookupTable << endl <<
"Rows:" << endl;
71 for (
auto s : lookupRows)
73 cout << endl <<
"Columns:" << endl;
75 for (
auto s : lookupColumns)
79 cout <<
"Class frequency:" << endl << yFrequency;
90 vector<string>
predict(vector<vector<string>> data,
bool verbose =
true) {
91 vector<string> csvHeader = data[0];
92 data.erase(data.begin());
93 vector<string> result(data.size());
94 MatrixD probabilities = MatrixD::ones(data.size(), lookupColumns.size());
97 #pragma omp parallel for if(data.size() > 500) 98 for (
size_t i = 0; i < data.size(); i++) {
99 vector<string> csvRow = data[i];
102 for (
size_t j = 0; j < csvRow.size(); j++) {
103 string rowElement = csvHeader[j] +
'+' + csvRow[j];
104 size_t row =
static_cast<size_t>(distance(lookupRows.begin(),
105 find(lookupRows.begin(), lookupRows.end(), rowElement)));
108 for (
size_t col = 0; col < lookupColumns.size(); col++) {
109 int lookup = lookupTable(row, col), yFreq = yFrequency(col, 0);
110 double currentFrequency = (double) lookupTable(row, col) / yFrequency(col, 0);
111 probabilities(i, col) *= currentFrequency;
115 int maxProbIndex = -1;
116 double currentMaxProb = 0, probSum = 0;
117 for (
size_t j = 0; j < lookupColumns.size(); j++) {
118 probSum += probabilities(i, j);
119 if (probabilities(i, j) > currentMaxProb) {
120 currentMaxProb = probabilities(i, j);
121 maxProbIndex =
static_cast<int>(j);
126 for (
size_t j = 0; j < lookupColumns.size(); j++)
127 probabilities(i, j) /= probSum;
129 result[i] = maxProbIndex != -1 ? lookupColumns[maxProbIndex] :
"NaN";
133 cout <<
"Probabilities:" << endl << probabilities;
139 #endif //MACHINE_LEARNING_NAIVEBAYES_HPP vector< string > lookupRows
NaiveBayes(const string &csvPath, bool verbose=true)
Naive Bayes classifier.
Naive Bayes classifier for categorical data, able to do classification
vector< string > predict(vector< vector< string >> data, bool verbose=true)
Predict the classes of new data.