Commit e37c1a88 authored by ehebrard

rm stuff

parent f1061267
*.so
*_wrap.cxx
*.o
*.pyc
__pycache__
budFirstSearch.py
from .bud_first_search import *
from .adaboost import *
from .wrapper import DTOptions, Wood, Tree, BacktrackingAlgo, Adaboost
from .wrapper import parse, read_binary
import sys
import bud_first_search as bfs
opt = bfs.parse(bfs.to_str_vec(sys.argv))
filename = str(opt.instance_file)
wood = bfs.Wood()
algo = bfs.BacktrackingAlgo(wood, opt)
if filename == "":
    print("No input file!")
    sys.exit(-1)
else:
    bfs.read_binary(algo, opt)
    algo.minimize_error()
    tree = algo.getSolution()
    nodes, edges = bfs.read_tree(tree)
    print("nodes:", nodes, "\n\nedges: ", edges)
from . import wrapper
from .utils import *
class AdaBoostClassifier:
    def __init__(self):
        self.opt = wrapper.parse(to_str_vec(["adaboost.py", "file"]))

    def fit(self, X, Y):
        self.algo = wrapper.Adaboost(self.opt)
        for x, y in zip(X, Y):
            self.algo.addExample(x + [y])
        self.algo.train()

    def predict(self, X):
        Y = []
        for x in X:
            Y.append(self.algo.predict(x))
        return Y
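
# Minimal usage sketch for the class above (illustrative only: assumes the compiled
# SWIG wrapper is importable and that rows of X are binary int features with 0/1
# labels in Y, as expected by Adaboost::addExample).
if __name__ == "__main__":
    X = [[1, 0, 1], [1, 1, 0], [0, 1, 1]]
    Y = [1, 1, 0]
    clf = AdaBoostClassifier()
    clf.fit(X, Y)
    print(clf.predict(X))  # list of booleans from Adaboost::predict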
from . import wrapper
from .utils import *
import copy
import numpy as np
from enum import IntEnum
# utility enums from CmdLine.hpp
class Verbosity(IntEnum):
    SILENT = 0
    QUIET = 1
    NORMAL = 2
    YACKING = 3
    SOLVERINFO = 4


class NodeStrategy(IntEnum):
    FIRST = 0
    RANDOM = 1
    ERROR = 2
    ERROR_REDUCTION = 3


class FeatureStrategy(IntEnum):
    MIN_ERROR = 0
    ENTROPY = 1
    GINI = 2
    HYBRID = 3
def read_tree(tree):
    nodes = []
    edges = []

    def add_node(tree, node):
        if node <= 1:
            nodes.append({"leaf": True, "feat": node})
        else:
            id = len(nodes)
            nodes.append({"leaf": False, "feat": tree.getFeature(node)})
            # Add edge 1
            node0 = tree.getChild(node, 0)
            edges.append({"parent": id, "child": len(nodes), "val": 0})
            add_node(tree, node0)
            # Add edge 2
            node1 = tree.getChild(node, 1)
            edges.append({"parent": id, "child": len(nodes), "val": 1})
            add_node(tree, node1)

    add_node(tree, tree.idx)
    return nodes, edges
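
# For illustration (hypothetical tree): a stump testing feature 3 comes back as
#   nodes = [{"leaf": False, "feat": 3}, {"leaf": True, "feat": 0}, {"leaf": True, "feat": 1}]
#   edges = [{"parent": 0, "child": 1, "val": 0}, {"parent": 0, "child": 2, "val": 1}]
# i.e. leaves carry the predicted class in "feat", and each edge carries the value
# of the parent's tested feature on that branch.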
class BudFirstSearchClassifier:
    """
    Scikit-learn-compatible estimator to use BudFirstSearch with scikit-learn
    meta-algorithms (like AdaBoost).
    """

    def __init__(self, cmd_line_args=[], **kwargs):
        self.args = ["bud_first_search.py", "--file", ""] + cmd_line_args
        self.opt = wrapper.parse(to_str_vec(self.args))
        for key in kwargs:
            setattr(self.opt, key, kwargs[key])
        """
        # Mimics the sanity check of scikit-learn, to understand why it does not pass
        params = self.get_params()
        for name in kwargs:
            if params[name] is kwargs[name]:
                print(type(params[name]), params[name], "is", type(kwargs[name]), kwargs[name])
            else:
                print(type(params[name]), params[name], "is not", type(kwargs[name]), kwargs[name])
        """
        # When the classifier is cloned by AdaBoost, scikit-learn checks that the
        # parameters of the original and the clone are identical, using the "is"
        # operator as above. For some values (e.g. floats or big integers) the check
        # fails because the values are read back from the C++ DTOptions object.
        # If we return kwargs as the parameter dict, we pass the sanity check.
        if len(kwargs) != 0:
            # guard with "if key in kwargs" so a partial kwargs does not raise KeyError
            self.params = {key: kwargs[key] for key in self.get_param_names() if key in kwargs}
        else:
            self.params = None
        self.tree = None
        self.nodes = []
        self.edges = []
        self.classes_ = np.array([0, 1])
        self.n_classes_ = 2
    def get_param_names(self):
        # TODO add useful parameters
        return {"max_depth", "time", "search", "seed", "mindepth", "minsize"}

    def get_params(self, deep=False):
        params = {key: getattr(self.opt, key) for key in self.get_param_names()}
        # if we were cloned: pass the sanity check
        if self.params == params:
            return self.params
        else:
            self.params = None
            return params

    def set_params(self, **kwargs):
        for key in kwargs:
            if hasattr(self.opt, key):
                setattr(self.opt, key, kwargs[key])
        # scikit-learn expects set_params to return the estimator itself
        return self
    def _binarize_data(self, X, Y):
        """
        Binarize the data (only if necessary).
        Does not support multiclass.
        This method just performs a translation of the two observed values to {0, 1}.
        """
        vals = set()
        for x, y in zip(X, Y):
            vals.update(x)
            vals.update([y])
        if len(vals) != 2:
            print("Classification is not binary!")
        if vals == {0, 1}:
            Xb = X
            Yb = Y
        else:
            Xb = copy.deepcopy(X)
            Yb = copy.deepcopy(Y)
            for old, new in zip(vals, {0, 1}):
                print(old, new)
                for i in range(len(X)):
                    for j in range(len(X[i])):
                        if X[i][j] == old:
                            Xb[i][j] = new
                    if Y[i] == old:
                        Yb[i] = new
        return Xb, Yb
    def fit(self, X, Y, sample_weight=None):
        self.wood = wrapper.Wood()
        Xb, Yb = X, Y  # self._binarize_data(X, Y)
        if sample_weight is not None:
            self.algo = wrapper.WeightedBacktrackingAlgod(self.wood, self.opt)
            for x, y, w in zip(Xb, Yb, sample_weight):
                # scikit-learn classes start at 1
                self.algo.addExample(to_int_vec(list(x) + [y]), w)
        else:
            self.algo = wrapper.BacktrackingAlgo(self.wood, self.opt)
            for x, y in zip(Xb, Yb):
                # scikit-learn classes & features start at 1
                self.algo.addExample(to_int_vec(list(x) + [y]))
        if self.opt.mindepth:
            if self.opt.minsize:
                self.algo.minimize_error_depth_size()
            else:
                self.algo.minimize_error_depth()
        else:
            self.algo.minimize_error()
        self.tree = self.algo.getSolution()
        self.nodes, self.edges = read_tree(self.tree)
        # free memory
        del self.algo
        del self.wood
        # scikit-learn expects fit to return the estimator itself
        return self
    def predict(self, X):
        if not self.tree:
            raise ValueError("please call fit before predict!")
        Y = []
        for x in X:
            node_id = 0
            node = self.nodes[node_id]
            while not node["leaf"]:
                val = x[node["feat"]]
                node_id = [e["child"] for e in self.edges if e["parent"] == node_id and e["val"] == val][0]
                node = self.nodes[node_id]
            Y.append(node["feat"])
        return np.array(Y)
    def correct_count(self, samples):
        """
        Returns the number of examples that are correctly classified.
        """
        correct_count = 0
        for sample in samples:
            # predict expects a list of examples, so wrap the single feature vector
            y_pred = self.predict([sample[:-1]])[0]
            if y_pred == sample[-1]:
                correct_count += 1
        return correct_count
TEST_SAMPLE = [[1, 0, 1], [1, 1, 0], [0, 1, 1], [0, 0, 0]]
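
# Usage sketch on the toy sample above (illustrative only; requires the compiled
# SWIG wrapper). The estimator can also serve as the base estimator of
# sklearn.ensemble.AdaBoostClassifier, whose keyword name varies across versions
# ("estimator" vs "base_estimator").
if __name__ == "__main__":
    X = [s[:-1] for s in TEST_SAMPLE]
    Y = [s[-1] for s in TEST_SAMPLE]
    clf = BudFirstSearchClassifier(max_depth=2)
    clf.fit(X, Y)
    print(clf.predict(X))  # numpy array of 0/1 predictions
    print(clf.correct_count(TEST_SAMPLE), "of", len(TEST_SAMPLE), "correctly classified")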
include ../boost_home
WRAPRDIR ?= .
MAINDIR ?= ..
CCC = g++
SRC=$(WRAPRDIR)/wrapper/src
OBJ=$(WRAPRDIR)/wrapper/obj
INC=$(WRAPRDIR)/wrapper/include
SWIG=$(WRAPRDIR)/wrapper/swig
MAININC=$(MAINDIR)/src/include
PYFLAGS = `python3-config --cflags`
CFLAGS = -std=c++11 -fPIC -I$(INC) -I$(MAININC) -I$(BOOSTDIR) -I../tools $(PYFLAGS) # -ffloat-store
LFLAGS = -L$(OBJ) -flto
PWRAPSRC = $(wildcard $(SRC)/*.cpp)
PWRAPSWIG = $(wildcard $(SWIG)/*.i)
PWRAPAUX = $(PWRAPSRC:.cpp=.o)
PWRAPSWIGAUX = $(PWRAPSWIG:.i=_wrap.o)
PWRAPOBJ = $(patsubst $(SRC)/%, $(OBJ)/%, $(PWRAPAUX))
PWRAPSWIGOBJ = $(patsubst $(SWIG)/%, $(OBJ)/%, $(PWRAPSWIGAUX))
PLIBOBJ=$(wildcard $(MAINDIR)/src/obj/*.o)
wrapper/_budFirstSearch.so: $(PWRAPOBJ) $(PWRAPSWIGOBJ)
	$(CCC) -std=c++11 -fPIC -shared $(PWRAPOBJ) $(PWRAPSWIGOBJ) $(PLIBOBJ) -o wrapper/_budFirstSearch.so

$(OBJ)/budFirstSearch.o: $(SRC)/budFirstSearch.cpp
	$(CCC) $(CFLAGS) -c $(SRC)/budFirstSearch.cpp -o $@

$(OBJ)/budFirstSearch_wrap.o: $(SWIG)/budFirstSearch_wrap.cxx
	$(CCC) $(CFLAGS) -c $(SWIG)/budFirstSearch_wrap.cxx -o $@

$(SWIG)/budFirstSearch_wrap.cxx: $(SWIG)/budFirstSearch.i
	swig -c++ -python -py3 -outdir wrapper $(SWIG)/budFirstSearch.i

clean:
	rm -rf $(PWRAPOBJ) $(PWRAPSWIGOBJ) $(SWIG)/*_wrap.cxx wrapper/*.so wrapper/budFirstSearch.py
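
# Example build (illustrative; assumes ../boost_home defines BOOSTDIR and that the
# main library objects in $(MAINDIR)/src/obj have been compiled beforehand):
#   make wrapper/_budFirstSearch.so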
from . import wrapper
def to_str_vec(str_list):
    vec = wrapper.str_vec(len(str_list))
    for i in range(len(str_list)):
        vec[i] = str_list[i]
    return vec


def to_int_vec(sample):
    vec = wrapper.int_vec(len(sample))
    for i in range(len(sample)):
        vec[i] = int(sample[i])
    return vec
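
# For example (illustrative), to_str_vec(["bud_first_search.py", "--file", ""]) builds
# the wrapper.str_vec consumed by wrapper.parse, and to_int_vec([1, 0, 1]) builds the
# wrapper.int_vec consumed by addExample.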
from . import _budFirstSearch
from .budFirstSearch import *
#include <vector>
#include <string>
#include "Tree.hpp"
#include "Backtrack.hpp"
#include "CmdLine.hpp"
#include "Adaboost.hpp"
#define SWIG_FILE_WITH_INIT
extern DTOptions parse(std::vector<std::string> params);
extern void read_binary(primer::BacktrackingAlgorithm<> &A, DTOptions &opt);
#include "budFirstSearch.h"
#include <iostream>
#include "CSVReader.hpp"
#include "TXTReader.hpp"
using namespace primer;
/*
void addNode(Tree &tree, int node, Results &res) {
  if (node <= 1) {
    // add node
    res.nodes.push_back({true, node});
  } else {
    int id = res.nodes.size();
    res.nodes.push_back({false, tree.getFeature(node)});
    // add edge 1
    int node0 = tree.getChild(node, 0);
    res.edges.push_back({id, res.nodes.size(), 0});
    addNode(tree, node0, res);
    // add edge 2
    int node1 = tree.getChild(node, 1);
    res.edges.push_back({id, res.nodes.size(), 1});
    addNode(tree, node1, res);
  }
}
*/

DTOptions parse(std::vector<std::string> params) {
  std::vector<char *> cparams;
  for (auto &param : params) {
    cparams.push_back(const_cast<char *>(param.c_str()));
  }
  return parse_dt(cparams.size(), &cparams[0]);
}

void read_binary(BacktrackingAlgorithm<> &A, DTOptions &opt) {
  string ext{opt.instance_file.substr(opt.instance_file.find_last_of(".") + 1)};
  if (opt.format == "csv" or (opt.format == "guess" and ext == "csv")) {
    csv::read_binary(opt.instance_file, [&](vector<int> &data) {
      A.addExample(data.begin(), data.end() - 1, data.back());
    });
  } else if (opt.format == "dl8" or (opt.format == "guess" and ext == "dl8")) {
    txt::read_binary(opt.instance_file, [&](vector<int> &data) {
      auto y = *data.begin();
      A.addExample(data.begin() + 1, data.end(), y);
    });
  } else {
    if (opt.format != "txt" and ext != "txt")
      cout << "p Warning, unrecognized format, trying txt\n";
    txt::read_binary(opt.instance_file, [&](vector<int> &data) {
      A.addExample(data.begin(), data.end() - 1, data.back());
    });
  }
}
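
// Format summary (as implemented above): "csv" and "txt" expect the class label in
// the last column of each row, while "dl8" expects it in the first; when opt.format
// is "guess", the file extension decides, falling back to txt with a warning.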
%module budFirstSearch
%include "std_vector.i"
%include "std_string.i"
%{
#include "budFirstSearch.h"
%}
extern DTOptions parse(std::vector<std::string> params);
extern void read_binary(primer::BacktrackingAlgorithm<CardinalityError, int> &A, DTOptions &opt);
namespace std {
  %template(int_vec) vector<int>;
  %template(cstr_vec) vector<char*>;
  %template(str_vec) vector<string>;
};
// DTOptions
class DTOptions {
public:
  std::string instance_file;
  std::string debug;
  std::string output;
  std::string format;
  int verbosity;
  int seed;
  bool print_sol;
  bool print_par;
  bool print_ins;
  bool print_sta;
  bool print_cmd;
  bool verified;
  double sample;
  int width;
  double focus;
  int max_depth;
  int restart_base;
  double restart_factor;
  bool filter;
  double time;
  int search;
  bool bounding;
  int node_strategy;
  int feature_strategy;
  bool binarize;
  int ada_it;
  int ada_stop;
  bool mindepth;
  bool minsize;
  bool preprocessing;

  DTOptions();
};
namespace primer {

// Tree
class Wood {
public:
  Wood();
};

class Tree {
public:
  int idx;

  Tree() = delete;
  Tree(Wood*, int i);

  int getChild(int node, int branch);
  int getFeature(int node);
};

// BacktrackingAlgorithm
template <template<typename> class ErrorPolicy, typename E_t>
class BacktrackingAlgorithm {
public:
  BacktrackingAlgorithm() = delete;
  BacktrackingAlgorithm(Wood &w, DTOptions &o);

  void minimize_error();
  void minimize_error_depth();
  void minimize_error_depth_size();

  Tree getSolution();

  void addExample(const std::vector<int> &example);
  void addExample(const std::vector<int> &example, E_t weight);
};

class Adaboost {
public:
  DTOptions &options;

  Adaboost() = delete;
  Adaboost(DTOptions &opt);

  void train();
  bool predict(const std::vector<int> &example) const;
  void addExample(const std::vector<int> &example);
};

template <class ErrorType> class CardinalityError;
template <class ErrorType> class WeightedError;

%template(BacktrackingAlgo) BacktrackingAlgorithm<CardinalityError, int>;
%template(WeightedBacktrackingAlgo) BacktrackingAlgorithm<WeightedError, int>;
%template(WeightedBacktrackingAlgod) BacktrackingAlgorithm<WeightedError, double>;
}
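
The %template lines above fix the Python-visible names of the algorithm variants. A
minimal sketch of driving the weighted, double-precision variant from Python
(illustrative: made-up data, uniform weights, and the compiled extension assumed to
be importable; WeightedBacktrackingAlgod is reached through the wrapper submodule
since the package __init__ only re-exports the unweighted BacktrackingAlgo):

    import bud_first_search as bfs

    opt = bfs.parse(bfs.to_str_vec(["bud_first_search.py", "--file", ""]))
    wood = bfs.Wood()
    algo = bfs.wrapper.WeightedBacktrackingAlgod(wood, opt)
    for row in [[1, 0, 1], [1, 1, 0], [0, 1, 1]]:   # features + label per row
        algo.addExample(bfs.to_int_vec(row), 0.5)   # second argument is the weight
    algo.minimize_error()
    nodes, edges = bfs.read_tree(algo.getSolution())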