---
title: "Simple decision trees"
date: "2015-10-02"
---


**Update** (2016-02-09): It has been brought to my attention that one of the decision trees is not being generated correctly so please don't use these diagrams. I'm too lazy to debug this code though.

Decision trees are commonly used in machine learning because they work surprisingly well. A simple algorithm to make a decision tree is ID3. We choose the feature $X$ that maximizes information gain $IG(X)$, defined as:

$$
IG(X)\equiv H(Y) - H(Y|X)
$$

where $H$ is the entropy given by 

$$
H(X) = \sum_i -p(X=i) \log p(X=i)
$$

and the conditional entropy is given by

$$
H(Y|X) = \sum_i  p(X=i) H(Y|X=i)
$$

Type the data here (space delimited; first column is the output).

<style>
svg {
    display: block;
    margin: 0 auto;
    max-width: 100%;
}
</style>
<textarea rows="1" cols="80" id="mlfeatures">outlook temp. hum. wind</textarea>
<textarea rows="14" cols="80" id="mldata">no sun 26 high low
no sun 25 high high
yes overcast 25 high low
yes rain 24 high low
yes rain 19 normal low
no rain 20 normal high
yes overcast 20 normal high
no sun 23 high low
yes sun 20 normal low
yes rain 25 normal low
yes sun 24 normal high
yes overcast 22 high high
yes overcast 23 normal low
no rain 23 high high</textarea>

<button id="treebutton">Go!</button>

# ID3 Tree

<div id="id3"></div>

# C4.5 Tree

<div id="c45"></div>

# SVG diagram code

This is provided here so that you can save the diagrams by copying the following code into your favourite text editor and then saving it as an SVG file.

## ID3

<pre><code id="id3-pre"></code></pre>

## C4.5

<pre><code id="c45-pre"></code></pre>
<script>
var feats = [];
var IG = function(n, A) {
    var p = {};
    for(var i=0; i<n.length; i++) {
        if(p[n[i][0]] === undefined) p[n[i][0]] = 0;
        p[n[i][0]] += 1/n.length;
    }
    var pa = [];
    for(var i=0; i<A.length; i++) {
        pa.push({});
        for(var j=0; j<n.length; j++) {
            if(pa[i][n[j][A[i]]] === undefined) pa[i][n[j][A[i]]] = 0;
            pa[i][n[j][A[i]]] += 1/n.length;
        }
    }
    var HX = 0;
    for(var key in p) {
        HX += -p[key] * Math.log2(p[key]);
    }
    ig = {};
    for(var i=0; i<A.length; i++) {
        var HXY = 0;
        for(var key in pa[i]) {
            var pp = {};
            var nnn = 0;
            for(var j=0; j<n.length; j++) {
                if(n[j][A[i]] !== key) continue;
                if(pp[n[j][0]] === undefined) pp[n[j][0]] = 0;
                pp[n[j][0]] += 1;
                nnn++;
            }
            for(var kkey in pp) {
                pp[kkey] /= nnn;
            }
            var HY = 0;
            for(var kkey in pp) {
                HY += -pp[kkey] * Math.log2(pp[kkey]);
            }
            HXY += pa[i][key] * HY;
        }
        ig[A[i]] = HX - HXY;
    }
    return ig;
}

var splitInfo = function(n, A) {
    var si = {};
    for(var j=0; j<A.length; j++) {
        si[A[j]] = 0;
        var count = 0;
        var zxcv = {};
        for(var i=0; i<n.length; i++) {
            if(zxcv[n[i][A[j]]] === undefined) zxcv[n[i][A[j]]] = 0;
            zxcv[n[i][A[j]]]++;
        }
        for(var key in zxcv) {
            si[A[j]] -= zxcv[key]/n.length * Math.log2(zxcv[key]/n.length);
        }
    }
    return si;
}

var bestAttributeID3 = function(n, A) {
    ig = IG(n, A);
    var best = -1e9, besta = undefined;
    for(var i=0; i<A.length; i++) {
        if(ig[A[i]] > best) {
            best = ig[A[i]];
            besta = A[i];
        }
    }
    return besta;
}

var bestAttributeC45 = function(n, A) {
    ig = IG(n, A);
    si = splitInfo(n, A);
    var best = -1e9, besta = undefined;
    for(var i=0; i<A.length; i++) {
        if(ig[A[i]] > best) {
            best = ig[A[i]]/si[A[i]];
            besta = A[i];
        }
    }
    return besta;
}

function same(a) {
    for(var i=1; i<a.length; i++) {
        if(a[i][0] != a[i-1][0]) return false;
    }
    return true;
}
function BuildTree(n, A, bestAttribute) {
    if(A.length == 0 || same(n)) {
        var c = {};
        for(var i=0; i<n.length; i++) {
            if(c[n[i][0]] === undefined) c[n[i][0]] = 0;
            c[n[i][0]]++;
        }
        var best = 0, bestc = undefined;
        for(var key in c) {
            if(c[key] > best) {
                best = c[key];
                bestc = key;
            }
        }
        return {status:'leaf', class:bestc};
    }
    var a = bestAttribute(n, A);
    var c = {};
    for(var i=0; i<n.length; i++) {
        if(c[n[i][a]] === undefined) c[n[i][a]] = [];
        c[n[i][a]].push(n[i]);
    }
    var children = [];
    for(var key in c) {
        var newA = [];
        for(var j=0; j<A.length; j++) {
            if(A[j] != a) {
                newA.push(A[j]);
            }
        }
        children.push({key:key, subtree:BuildTree(c[key], newA, bestAttribute)});
    }
    return {status:'internal', attribute:feats[a-1], children:children};
}
function getData() {
    getFeat();
    var s = document.getElementById('mldata').value;
    s = s.split('\n');
    for(var i=0; i<s.length; i++) {
        s[i] = s[i].split(' ');
    }
    return s;
}
function getFeat() {
    var s = document.getElementById('mlfeatures').value;
    feats = s.split(' ');
}
var strk = 3;
var wdth = 60;
var mrgn = 5;
var fsz = 15;
function genSVG(tree) {
    if(tree.status === 'leaf') {
        return {width:2*mrgn+wdth, height:wdth, svg:'<rect x="'+mrgn+'" y="0" width="'+wdth+'" height="'+wdth+'" style="fill:#eee;stroke:#000;stroke-linecap:round;stroke-width:'+strk+'"/><text x="'+(mrgn+wdth/2)+'" y="'+(wdth/2+fsz/2)+'" font-size="'+fsz+'" text-anchor="middle">'+tree.class+'</text>'};
    }
    var wid = 0;
    var svg = '';
    var nodules = [];
    var maxheight = 0;
    for(var i=0; i<tree.children.length; i++) {
        var csvg = genSVG(tree.children[i].subtree);
        svg += '<g transform="translate('+wid+','+2*wdth+')"><text x="'+(csvg.width/2)+'" y="'+(wdth/2+fsz/2)+'" font-size="'+fsz+'" text-anchor="middle">'+tree.children[i].key+'</text><g transform="translate(0,'+wdth+')">'+csvg.svg+'</g></g>';
        nodules.push(wid + csvg.width/2);
        wid += csvg.width;
        if(csvg.height > maxheight) maxheight = csvg.height;
    }
    for(var i=0; i<nodules.length; i++) {
        svg += '<path d="M'+(wid/2)+' '+wdth+' C '+(wid/2)+' '+(wdth*1.6)+', '+(nodules[i])+' '+(wdth*1.6)+', '+(nodules[i])+' '+(wdth*2.3)+'" style="fill:none;stroke:#000;stroke-width:'+strk+'"/>';
        svg += '<line x1="'+nodules[i]+'" y1="'+wdth*2.7+'" x2="'+nodules[i]+'" y2="'+wdth*3+'" style="stroke:#000;stroke-width:'+strk+'"/>';
    }
    svg += '<circle cx="'+(wid/2)+'" cy="'+(wdth/2)+'" r="'+(wdth/2)+'" style="fill:#eee;stroke:#000;stroke-linecap:round;stroke-width:'+strk+'"/>';
    svg += '<text x="'+(wid/2)+'" y="'+(wdth/2+fsz/2)+'" font-size="'+fsz+'" text-anchor="middle">'+tree.attribute+'</text>';
    return {width:wid, height:maxheight+3*wdth, svg:svg};
}
function drawSVG(tree, el, el2) {
    var csvg = genSVG(tree);
    var w = csvg.width+2*mrgn, h = csvg.height+2*mrgn;
    var svg = [ '<?xml version="1.0" encoding="UTF-8" standalone="no"?>',
                '<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">',
                '<svg xmlns="http://www.w3.org/2000/svg"',
                '    viewBox="0 0 ' + w + ' ' + h + '"',
                '    width="' + w + '" height="' + h + '" preserveAspectRatio="xMidYMid meet">',
                '<g transform="translate('+mrgn+','+mrgn+')">' + csvg.svg + '</g>',
                '</svg>'].join('\n');
    el.innerHTML = svg;
    el2.innerText = svg;
    el2.textContent = svg;
}

function init() {
    var data = getData();
    var A = [];
    for(var i=1; i<data[0].length; i++) {
        A.push(i);
    }
    drawSVG(BuildTree(data, A, bestAttributeID3), document.getElementById('id3'), document.getElementById('id3-pre'));
    drawSVG(BuildTree(data, A, bestAttributeC45), document.getElementById('c45'), document.getElementById('c45-pre'));
}
document.getElementById('treebutton').onclick = init;
init();
</script>
