Commit e0faac3a authored by ehebrard
Browse files

update

parent 06a08b72
......@@ -47,15 +47,50 @@ int run_algorithm(DTOptions &opt) {
read_non_binary(input, opt);
}
// vector<size_t> subset;
WeightedDataset<E_t> *test_set = new WeightedDataset<E_t>();
WeightedDataset<E_t> *training_set = new WeightedDataset<E_t>();
if (opt.test_sample != 0) {
std::vector<int>::iterator endx[2] = {input.examples[0].bbegin(),
input.examples[1].bbegin()};
input.drawSample(opt.test_sample, *training_set, *test_set, opt.seed);
if (opt.sample_only) {
for (auto y{0}; y < 2; ++y) {
cout << y << " " << (endx[y] - input.examples[y].bbegin());
for (auto x{input.examples[y].bbegin()}; x != endx[y]; ++x) {
cout << " " << *x;
}
cout << endl;
// cout << input.examples[y] << endl;
}
// return 0;
}
// cout << *training_set << endl;
// cout << *test_set << endl;
} else {
training_set = &input;
}
// cout << training_set << endl;
// cout << input.example_count() << endl;
if (opt.verbosity >= DTOptions::NORMAL)
cout << "d readtime=" << cpu_time() << endl;
////// PREPROCESSING
if (opt.preprocessing)
input.preprocess(opt.verbosity >= DTOptions::NORMAL);
if (opt.preprocessing) {
training_set->preprocess(opt.verbosity >= DTOptions::NORMAL);
}
////// CREATING THE ALGORITHM
BacktrackingAlgorithm<ErrorPolicy, E_t> A(input, opt);
BacktrackingAlgorithm<ErrorPolicy, E_t> A(*training_set, opt);
if (opt.verbosity >= DTOptions::NORMAL)
cout << "d inputtime=" << cpu_time() << endl;
......@@ -78,7 +113,7 @@ int run_algorithm(DTOptions &opt) {
if (opt.minsize)
A.set_size_objective();
A.minimize_error();
}
}
Tree<E_t> sol = A.getSolution();
......@@ -86,7 +121,7 @@ int run_algorithm(DTOptions &opt) {
E_t tree_error = 0;
for (auto y{0}; y < 2; ++y) {
auto X{input[y]};
auto X{(*training_set)[y]};
for (auto i : X)
tree_error += (sol.predict(X[i]) != y) * X.weight(i);
}
......@@ -100,47 +135,76 @@ int run_algorithm(DTOptions &opt) {
if (opt.pruning) {
cout << "p post-pruning (additional error up to " << opt.pruning << ")\n";
// cout << sol << endl;
size_t total[2] = {input.total(0), input.total(1)};
E_t limit{static_cast<E_t>(opt.pruning) - A.error() - input.numInconsistent()};
if(limit < 0)
limit = 0;
cout << "p post-pruning (additional error up to " << opt.pruning << ")\n";
size_t total[2] = {training_set->total(0), training_set->total(1)};
E_t limit{static_cast<E_t>(opt.pruning) - A.error() -
training_set->numInconsistent()};
if (limit < 0)
limit = 0;
sol.prune(total, limit, false);
E_t tree_error = 0;
for (auto y{0}; y < 2; ++y) {
auto X{input[y]};
auto X{(*training_set)[y]};
for (auto i : X)
tree_error += (sol.predict(X[i]) != y) * X.weight(i);
}
// double t{cpu_time() - start_time};
double accuracy{1.0 - static_cast<double>(tree_error + input.numInconsistent())/static_cast<double>(input.input_example_count())};
// double t{cpu_time() - start_time};
cout << left << "d accuracy=" << setw(6) << setprecision(4)
<< fixedwidthfloat(accuracy, 4) << " error=" << setw(4)
<< tree_error + input.numInconsistent() << " depth=" << setw(3) << sol.depth()
<< " size=" << setw(3) << sol.size()
// << " time=" << setprecision(max(4, static_cast<int>(log10(t))))
// << fixedwidthfloat(t, 3) << right
<< endl;
double accuracy{
1.0 -
static_cast<double>(tree_error + training_set->numInconsistent()) /
static_cast<double>(training_set->input_example_count())};
cout << left << "d accuracy=" << setw(6) << setprecision(4)
<< fixedwidthfloat(accuracy, 4) << " error=" << setw(4)
<< tree_error + training_set->numInconsistent() << " depth=" << setw(3)
<< sol.depth() << " size=" << setw(3) << sol.size()
// << " time=" << setprecision(max(4, static_cast<int>(log10(t))))
// << fixedwidthfloat(t, 3) << right
<< endl;
// cout << "after pruning: " << tree_error << endl;
// cout << sol.size() << " " << sol.depth() << endl;
}
if (opt.tree_file != "") {
ofstream treefile(opt.tree_file, ios_base::out);
// treefile << A << endl;
treefile << sol << endl;
}
if (opt.test_sample != 0) {
E_t tree_error = 0;
for (auto y{0}; y < 2; ++y) {
auto X{(*test_set)[y]};
for (auto i : X) {
assert(X.weight(i) == 1);
tree_error += (sol.predict(X[i]) != y) * X.weight(i);
}
}
// assert(tree_error == A.error());
cout << std::setprecision(std::numeric_limits<long double>::digits10 + 1)
<< std::setw(0) << "d test_error=" << tree_error
<< " test_accuracy=" << setprecision(7)
<< 1.0 -
static_cast<double>(tree_error) /
static_cast<double>(test_set->example_count())
<< endl;
}
if (opt.print_sol) {
cout << sol << endl;
}
return 1;
return 0;
}
......
......@@ -18,6 +18,8 @@ public:
// the actual options
string cmdline; // for reference
string instance_file;
string tree_file;
// string test;
string debug;
string output;
string format;
......@@ -35,7 +37,7 @@ public:
bool verified;
double sample;
double test_sample;
int width;
double focus;
......@@ -79,25 +81,27 @@ public:
double pruning;
bool sample_only;
DTOptions(){};
// Copy constructor: initializes every member named in the initializer list
// from the member of the same name in opt; the body itself does nothing.
// NOTE(review): two variants of the initializer list appear below (one
// starting at debug(...), one starting at tree_file(...)), and likewise two
// closing variants (with and without sample_only) -- this looks like the
// before/after lines of a rendered diff; confirm which set is the live one.
// NOTE(review): if the list covers all members, `= default` would be
// equivalent -- the full member list is not visible here, so confirm first.
DTOptions(const DTOptions &opt)
: cmdline(opt.cmdline), instance_file(opt.instance_file),
debug(opt.debug), output(opt.output), format(opt.format),
verbosity(opt.verbosity), seed(opt.seed), print_sol(opt.print_sol),
print_par(opt.print_par), print_ins(opt.print_ins),
print_sta(opt.print_sta), print_cmd(opt.print_cmd),
verified(opt.verified), sample(opt.sample), width(opt.width),
focus(opt.focus), max_depth(opt.max_depth),
restart_base(opt.restart_base), restart_factor(opt.restart_factor),
time(opt.time), search(opt.search), bounding(opt.bounding),
node_strategy(opt.node_strategy),
tree_file(opt.tree_file), debug(opt.debug), output(opt.output),
format(opt.format), verbosity(opt.verbosity), seed(opt.seed),
print_sol(opt.print_sol), print_par(opt.print_par),
print_ins(opt.print_ins), print_sta(opt.print_sta),
print_cmd(opt.print_cmd), verified(opt.verified),
test_sample(opt.test_sample), width(opt.width), focus(opt.focus),
max_depth(opt.max_depth), restart_base(opt.restart_base),
restart_factor(opt.restart_factor), time(opt.time), search(opt.search),
bounding(opt.bounding), node_strategy(opt.node_strategy),
feature_strategy(opt.feature_strategy), split(opt.split),
ada_it(opt.ada_it), ada_stop(opt.ada_stop), filter(opt.filter),
reference_class(opt.reference_class), mindepth(opt.mindepth),
minsize(opt.minsize), preprocessing(opt.preprocessing),
progress(opt.progress), delimiter(opt.delimiter),
intarget(opt.intarget), outtarget(opt.outtarget), pruning(opt.pruning) {
}
intarget(opt.intarget), outtarget(opt.outtarget), pruning(opt.pruning),
sample_only(opt.sample_only) {}
ostream &display(ostream &os);
};
......
......@@ -26,14 +26,16 @@ public:
void addExample(const vector<int> &x);
void addBitsetExample(instance &x, const bool y);
void rmExample(const bool y, const int idx);
void addBitsetExample(instance &x, const bool y, const E_t w = 1);
// template <class Algo> void toInc(Algo &algo);
// template <class Algo> void setup(Algo &algo) const;
void preprocess(const bool verbose = false);
// randomly select ratio * count(c) examples from classes c in {0,1}
void sample(const double ratio, const long seed=12345);
// // randomly select ratio * count(c) examples from classes c in {0,1}
// void sample(const double ratio, const long seed = 12345);
// Number of instances stored in data[c], i.e. the size of the class-c
// container regardless of which of them are currently in examples[c].
size_t input_count(const bool c) const {
  const auto &stored{data[c]};
  return stored.size();
}
// Total number of stored instances: input_count summed over both classes.
size_t input_example_count() const {
  const size_t class0{input_count(0)};
  const size_t class1{input_count(1)};
  return class0 + class1;
}
......@@ -49,30 +51,13 @@ public:
template <class selector>
void printDatasetToFile(ostream &outfile, const string &delimiter,
const string &endline, selector not_redundant,
const bool first = true,
const bool header = false) const;
const bool first = true, const bool header = false,
const bool weighted = false) const;
template <class selector>
void printHeader(ostream &outfile, const string &delimiter,
const string &endline, const string &label,
selector not_redundant, const bool first = true) const;
// void printDatasetToTextFile(ostream &outfile, const bool first =
// true)
// const;
// template <class selector>
// void printDatasetToTextFile(ostream &outfile, selector s,
// const bool first) const;
// void printDatasetToCSVFile(ostream &outfile, const string &delimiter
// = ",",
// const bool first = false) const;
// template <class selector>
// void printDatasetToCSVFile(ostream &outfile, const string &delimiter
// = ",",
// const bool first = false) const;
class List {
public:
......@@ -106,23 +91,39 @@ public:
// Returns the current value of suppression_count.
// NOTE(review): presumably the weight of contradictory examples dropped by
// preprocess() -- confirm against the live preprocess implementation.
size_t numInconsistent() const {
  return suppression_count;
}
// // remove datapoint in indices and add them to subset
// template <class Container>
// void split(WeightedDataset<E_t> &subset, Container &choice);
void drawSample(const double ratio, WeightedDataset<E_t> &training,
WeightedDataset<E_t> &test, const long seed = 12345);
private:
vector<instance> data[2];
vector<E_t> weight[2];
public:
SparseSet examples[2];
private:
// vector<pair<bool, size_t>> exlog;
E_t total_weight[2]{0, 0};
size_t suppression_count{0};
};
// Append instance x to class y with weight w and make it an active example.
//
// x: the instance to store; it is copied into data[y].
// y: class label, selects which of the two per-class stores receives x.
// w: weight recorded for the new example (the declaration defaults it to 1).
template <typename E_t>
void WeightedDataset<E_t>::addBitsetExample(instance &x, const bool y,
                                            const E_t w) {
  data[y].push_back(x);

  // The index set must be able to address every slot data[y] may hold before
  // the index of the freshly appended instance is inserted.
  examples[y].reserve(data[y].capacity());
  examples[y].add(data[y].size() - 1);

  weight[y].push_back(w);

  // total_weight[y] accumulates the weight of class y, so it has to grow by
  // the weight actually stored (w), not by a fixed 1.
  total_weight[y] += w;
}
......@@ -131,6 +132,89 @@ inline void WeightedDataset<E_t>::addExample(const vector<int> &example) {
return addExample(example.begin(), example.end(), -1, 1);
}
// template <typename E_t>
// template <class Container>
// void WeightedDataset<E_t>::split(WeightedDataset<E_t> &subset,
// Container &tests) {
// // for (auto y{0}; y < 2; ++y) {
// // for (auto i : indices[y]) {
// // auto x{examples[y][i]};
// // subset.addBitsetExample(data[y][x], y, weight[y][x]);
// // total_weight -= weight[y][x];
// // examples[y].remove_back(x);
// // }
// // }
// for (auto t : tests) {
// auto y{exlog[t].first};
// auto x{exlog[t].second};
// subset.addBitsetExample(data[y][x], y, weight[y][x]);
// total_weight[y] -= weight[y][x];
// examples[y].remove_back(x);
// }
// }
// Take example x of class y out of the index set examples[y].
//
// Only the index set is touched: data[y], weight[y] and total_weight[y] are
// left exactly as they were.
template <typename E_t>
void WeightedDataset<E_t>::rmExample(const bool y, const int x) {
  auto &active{examples[y]};

  // Remember where x sits and which element is currently last, so the
  // effect of the removal can be checked below.
  const auto slot{active.index(x)};
  const auto last{active.back()};

  active.remove_back(x);

  // After the removal, the element that used to be last is expected to
  // occupy the slot x was in.
  assert(active[slot] == last);
}
// Randomly split the active examples of this dataset into `test` and
// `training`.
//
// For each class y, examples are drawn with an mt19937 generator seeded with
// `seed`, added to `test` and removed from this dataset's active set (via
// rmExample) until count(y) is no larger than count(y) * (1 - ratio),
// truncated to an integer. Whatever is still in examples[y] is then added to
// `training`. Every example keeps its weight in the dataset it ends up in.
//
// ratio:    fraction of each class sent to `test`. Values outside [0, 1] are
//           clamped: a ratio above 1 moves everything to `test`, a negative
//           ratio moves nothing.
// training: receives the examples that were not drawn.
// test:     receives the drawn examples.
// seed:     seed of the random generator, so a split is reproducible.
template <typename E_t>
void WeightedDataset<E_t>::drawSample(const double ratio,
                                      WeightedDataset<E_t> &training,
                                      WeightedDataset<E_t> &test,
                                      const long seed) {
  // Converting a negative double to size_t is undefined behaviour, so the
  // fraction of examples to keep is forced into [0, 1] before it is used.
  double keep{1.0 - ratio};
  if (keep < 0.0)
    keep = 0.0;
  else if (keep > 1.0)
    keep = 1.0;

  mt19937 random_generator;
  random_generator.seed(seed);

  for (auto y{0}; y < 2; ++y) {
    const size_t target{
        static_cast<size_t>(static_cast<double>(count(y)) * keep)};

    // Each pass moves one randomly chosen active example to the test set;
    // the loop relies on rmExample shrinking count(y).
    while (count(y) > target) {
      const auto i{random_generator() % count(y)};
      const auto x{examples[y][i]};
      test.addBitsetExample(data[y][x], y, weight[y][x]);
      rmExample(y, x);
    }

    // Everything still active for this class forms the training set.
    for (auto x : examples[y]) {
      training.addBitsetExample(data[y][x], y, weight[y][x]);
    }
  }
}
template <typename E_t>
template <class rIter>
inline void WeightedDataset<E_t>::addExample(rIter beg_row, rIter end_row,
......@@ -140,6 +224,8 @@ inline void WeightedDataset<E_t>::addExample(rIter beg_row, rIter end_row,
auto column{(width + target) % width};
auto y{*(beg_row + column)};
// exlog.push_back({y, data[y].size()});
++total_weight[y];
if (data[y].size() == data[y].capacity()) {
......@@ -155,8 +241,8 @@ inline void WeightedDataset<E_t>::addExample(rIter beg_row, rIter end_row,
int f{0};
for (auto x{beg_row}; x != end_row; ++x) {
// assert(*x == 0 or *x == 1);
if (*x != 0 and *x != 1)
throw 0;
if (*x != 0 and *x != 1)
throw 0;
if (x - beg_row != column) {
if (*x)
data[y].back().set(f);
......@@ -179,18 +265,20 @@ inline void WeightedDataset<E_t>::addExample(rIter beg_row, rIter end_row,
// algo.setErrorOffset(suppression_count);
// }
// Randomly thin out the active examples of each class.
//
// For each class y, randomly chosen entries are removed from examples[y]
// until count(y) is no larger than count(y) * ratio, truncated to an integer.
//
// ratio: fraction of each class to keep. Values outside [0, 1] are clamped:
//        a negative ratio removes everything, a ratio above 1 removes nothing.
// seed:  seed of the mt19937 generator, so the selection is reproducible.
template <typename E_t>
void WeightedDataset<E_t>::sample(const double ratio, const long seed) {
  // Converting a negative double to size_t is undefined behaviour, so the
  // fraction is forced into [0, 1] before the target size is computed.
  double keep{ratio};
  if (keep < 0.0)
    keep = 0.0;
  else if (keep > 1.0)
    keep = 1.0;

  mt19937 random_generator;
  random_generator.seed(seed);

  for (auto y{0}; y < 2; ++y) {
    const size_t target{
        static_cast<size_t>(static_cast<double>(count(y)) * keep)};
    while (count(y) > target) {
      const auto i{random_generator() % count(y)};
      examples[y].remove_back(examples[y][i]);
    }
  }
}
// template <typename E_t> void WeightedDataset<E_t>::sample(const double ratio,
// const long seed) {
// mt19937 random_generator;
// random_generator.seed(seed);
//
// for (auto y{0}; y < 2; ++y) {
// size_t target{static_cast<size_t>(static_cast<double>(count(y)) *
// ratio)};
// while (count(y) > target) {
// auto i{random_generator() % count(y)};
// examples[y].remove_back(examples[y][i]);
// }
// }
// }
template <typename E_t> void WeightedDataset<E_t>::preprocess(const bool verbose) {
......@@ -269,10 +357,10 @@ template <typename E_t> void WeightedDataset<E_t>::preprocess(const bool verbose
// assert(i[y] < weight[y].size());
if(x[y] != end[y])
wght[y] = weight[y][i[y]];
else
wght[y] = 0;
if (x[y] != end[y])
wght[y] = weight[y][i[y]];
else
wght[y] = 0;
}
}
}
......@@ -347,6 +435,174 @@ template <typename E_t> void WeightedDataset<E_t>::preprocess(const bool verbose
// // cout << suppression_count << endl;
}
// template <typename E_t> void WeightedDataset<E_t>::preprocess(const bool
// verbose) {
//
// auto t{cpu_time()};
//
// suppression_count = 0;
// // unsigned long dup_count = 0; // for statistics
//
// for (int y = 0; y < 2; ++y) {
// std::sort(examples[y].begin(), examples[y].end(), [&](const int i, const
// int j) {return data[y][i] <= data[y][j]});
// }
//
// if (verbose)
// cout << "d sorttime=" << cpu_time() - t << endl;
//
// vector<instance>::iterator x[2] = {data[0].begin(), data[1].begin()};
// vector<instance>::iterator end[2] = {data[0].end(), data[1].end()};
//
// // int wght[2] = {1, 1};
//
// int i[2] = {0, 0};
// E_t wght[2] = {weight[0][i[0]], weight[1][i[1]]};
//
// // cout << endl << setw(3) << data[0].size() << " " << setw(3) <<
// // data[1].size() << endl;
//
// while (x[0] != end[0] and x[1] != end[1]) {
//
// // cout << endl << setw(3) << i[0] << " " << setw(3) << i[1] << endl;
//
// for (int y = 0; y < 2; ++y)
// while (x[y] != (end[y] - 1) and *(x[y]) == *(x[y] + 1)) {
// // cout << "remove (" << y << ") " << i[y] << endl;
// examples[y].remove_back(i[y]);
// ++x[y];
// ++i[y];
// wght[y] += weight[y][i[y]];
// }
//
// // cout << setw(3) << i[0] << " " << setw(3) << i[1] << endl;
//
// if (*x[0] < *x[1]) {
// weight[0][i[0]] = wght[0];
// ++x[0];
// ++i[0];
// wght[0] = weight[0][i[0]];
// } else if (*x[0] > *x[1]) {
// weight[1][i[1]] = wght[1];
// ++x[1];
// ++i[1];
// wght[1] = weight[1][i[1]];
// } else {
// if (wght[0] < wght[1]) {
// weight[1][i[1]] = wght[1] - wght[0];
//
// // cout << "remove0 " << i[0] << endl;
// examples[0].remove_back(i[0]);
// suppression_count += wght[0];
// } else if (wght[0] > wght[1]) {
// weight[0][i[0]] = wght[0] - wght[1];
//
// // cout << "remove1 " << i[1] << endl;
// examples[1].remove_back(i[1]);
// suppression_count += wght[1];
// } else {
// suppression_count += wght[1];
//
// // cout << "remove " << i[0] << " and " << i[1] << endl;
//
// examples[0].remove_back(i[0]);
// examples[1].remove_back(i[1]);
// }
// for (int y = 0; y < 2; ++y) {
// ++x[y];
// ++i[y];
//
// // assert(i[y] < weight[y].size());
//
// if (x[y] != end[y])
// wght[y] = weight[y][i[y]];
// else
// wght[y] = 0;
// }
// }
// }
//
// for (int y = 0; y < 2; ++y) {
//
// // cout << "wght[y]: " << wght[y] << endl;
//
// wght[y] = 0;
//
// for (; x[y] != end[y]; ++x[y]) {
// assert(x[1 - y] == end[1 - y]);
//
// wght[y] += weight[y][i[y]];
//
// if (x[y] == end[y] - 1 or *x[y] != *(x[y] + 1)) {
// weight[y][i[y]] = wght[y];
// wght[y] = 0;
// } else {
//
// // cout << "remove end " << i[y] << endl;
//
// examples[y].remove_back(i[y]);
// }
// ++i[y];
// }
// }
//
// auto dup_count{input_count(0) + input_count(1) - count(0) - count(1) -
// 2 * suppression_count};
// if (verbose)
// std::cout << "d duplicate=" << dup_count
// << " suppressed=" << suppression_count << " ratio="
// << float(dup_count + 2 * suppression_count) /
// input_example_count()
// << " count=" << input_example_count() << " negative=" <<
// count(0)
// << " positive=" << count(1) << " final_count=" <<
// example_count()
// << "\nd preprocesstime=" << cpu_time() - t << endl;
//
// for (auto i{0}; i < 2; ++i)
// total_weight[i] -= suppression_count;