ã©ã³ãã ãã©ã¬ã¹ããå®è£ ãã
ã©ã³ãã ãã©ã¬ã¹ãï¼
å®è£ ãã¦ã¿ãæããæ師ãããã¼ã¿ããåé¡ã»å¦ç¿ãããã®å¾ã«ä¸ããããæªç¥ã®ãã¼ã¿ã«å¯¾ãã¦ãèå¥ã»åé¡ãè¡ãã¢ã«ã´ãªãºã ã
æ£çç㯠7-8 å²ä½ãç®å®ãä½ãè¤éãªå¦ç¿ã¯ã§ããªããä»ã®ã¢ã«ã´ãªãºã ã¨çµã¿åããã¦ä½¿ãã¨ããæãã«ãªããããã
表é¢çãªç¹å¾´ã§è¨ãã°ãæ£ççã¯7-8å²ç¨åº¦ã§ã¯ããããæ¯è¼çå¦çã軽ãã®ã¨ããã¾ã«ããä¾å¤çãªãã¼ã¿ãã¤ãºã«å¼·ãã¨ããã ã
アルゴリズム
æ°å¼ãããªã«ããã¯ä»¥ä¸ã®ã¹ã©ã¤ãã«ä»»ããã
機会学習ハッカソン:ランダムフォレスト
è¤æ°ã®å¦ç¿å¨ã«ã©ã³ãã æ½åºã®ãã¼ã¿ãé£ããã¦å¦ç¿ããããã¼ã¿ãä¸ããããéã«ã¯ãã®å¦ç¿å¨ã®æ票ã«ãã£ã¦ãã®çµæãèå¥ããã
ãã®ãããéåæ°æªæºã®å¦ç¿å¨ããå¦ç¿ãã¼ã¿ã®ãã¼ã¿ãã¤ãºã§å¦ãªå¦ç¿ããã¦ãã¾ã£ã¦ããããç¨åº¦è£æ£ãå¹ãã¨ããã®ãå©ç¹ã
勉強会の雰囲気
åå¼·ä¼ã§å®è£ åºæ¥ãã®ã¯ãåç©ã®èå¥ã ãã ã£ããåç©ã®èå¥ãã¿ã¼ã³ã¯å ¨ã¦äºå¤ã ã£ãã®ã§ãããããã£ãã¨ãè¨ããã
ï¼æéã§ç解ãã¦çµãã£ã¦çµæ§é¬¼ä»æ§ã ã£ãã¨ã ãè¨ã£ã¦ãããå®è£ åºæ¥ã人ã®æ¹ãå°ãªãé°å²æ°ã ã£ãã
ä¾ã«é ã£ã¦ Scala å®è£ ã ãããããªæãã ã
実装(動物の識別)
package animals

import scala.io.Source

/**
  * One labelled training record.
  *
  * @param field  categorical answers, addressed by question index
  * @param result class label of the record
  */
case class Line(field: List[String], result: String)

/** Loads the training data and provides the sampling and entropy helpers used to grow the forest. */
object Resource {

  /**
    * Reads the tab-separated training file. The first three columns of every
    * non-empty line become the features and the fourth column the label.
    *
    * The file handle is closed before returning, also when parsing fails.
    *
    * @return all records of the file, in file order
    */
  def read(): List[Line] = {
    val file = Source.fromFile("/path/to/animals.dat", "utf-8")
    try {
      file.getLines().filterNot(_.isEmpty).map { s =>
        val splits = s.split("\t")
        Line(List(splits(0), splits(1), splits(2)), splits(3))
      }.toList // materialise while the file is still open
    } finally {
      file.close()
    }
  }

  /**
    * Draws a random training sample for one tree.
    *
    * @param data the full data set
    * @return three records picked at random (fewer when `data` is smaller)
    */
  def dataSets(data: List[Line]): List[Line] = {
    import scala.util.Random
    // Deliberately 3 instead of (data.size / 3) * 2: the data set used here is too small for a 2/3 sample.
    Random.shuffle(data).take(3)
  }

  /**
    * Shannon entropy (natural logarithm) of the label distribution.
    *
    * @param data records whose `result` labels are counted
    * @return `-sum(p * ln p)` over the labels; `0.0` for an empty or single-label list
    */
  def defaultEnt(data: List[Line]): Double = {
    val total = data.size.toDouble
    data.groupBy(_.result).values.map { rows =>
      val p = rows.size / total
      -(p * Math.log(p))
    }.sum
  }
}

/**
  * One node of a decision tree over categorical questions.
  *
  * On construction the node picks the question with the highest information
  * gain and builds one child per answer value seen in `dataSet`.
  *
  * @param parent  label entropy before this node's split (the caller supplies it for the root)
  * @param fields  question indexes still available at this node; must not be empty
  * @param dataSet training records that reached this node
  */
case class Node(parent: Double, fields: List[Int], dataSet: List[Line]) {
  // The original failed here as well, but with an opaque `last` on an empty list.
  require(fields.nonEmpty, "Node needs at least one candidate question index")

  val currentSize = dataSet.size

  /**
    * The best question and its score, as `(question index, information gain)`.
    *
    * The gain is `parent` minus the size-weighted label entropy of the groups
    * the question produces. A question whose answer is the same for every
    * record therefore scores lowest instead of highest.
    */
  val maxEntField = {
    val scored = fields.map { i =>
      val remaining = dataSet.groupBy(_.field(i)).values.map { group =>
        group.size.toDouble / currentSize * Resource.defaultEnt(group)
      }.sum
      (i, parent - remaining)
    }
    scored.maxBy(_._2)
  }

  val question = maxEntField._1
  val currentEnt = maxEntField._2

  /**
    * One child per distinct answer to `question`, or `None` when `question`
    * was the last available index (the node then acts as a leaf).
    * Each child receives the label entropy of its own subset as `parent`.
    */
  val childNodes: Option[List[Node]] = {
    val usableNext = fields.filterNot(_ == question)
    if (usableNext.isEmpty) None
    else Some(
      dataSet.groupBy(_.field(question)).values.toList
        .map(rows => Node(Resource.defaultEnt(rows), usableNext, rows))
    )
  }

  /**
    * Classifies one sample with this tree.
    *
    * @param check answers of the sample, addressed by question index
    * @return the predicted label
    * @throws NoSuchElementException when a leaf holds no training data
    */
  def whats(check: List[String]): String = childNodes match {
    case Some(children) if children.nonEmpty =>
      val answer = check(question)
      // Route to the child trained on this answer. For an answer never seen in
      // training keep the historical fallback: the second child, else the only one.
      children
        .find(_.dataSet.head.field(question) == answer)
        .orElse(children.lift(1))
        .getOrElse(children.head)
        .whats(check)
    case _ =>
      majorityLabel
  }

  /** Most frequent label in `dataSet`; ties go to the label that appears first. */
  private def majorityLabel: String = {
    if (dataSet.isEmpty) throw new NoSuchElementException("node holds no training data to answer with")
    val counts = dataSet.groupBy(_.result).map { case (label, rows) => label -> rows.size }
    val best = counts.values.max
    dataSet.map(_.result).find(label => counts(label) == best).getOrElse(dataSet.head.result)
  }

  /** Debug rendering including the chosen question, its score and the subtree. */
  override def toString() = s"Node($parent, $dataSet, $question, $currentEnt, $childNodes)"
}

/** Demo entry point: grows 15 trees and lets each one classify a single sample. */
object RandomForest extends App {
  // Data source
  val all = Resource.read()
  // Label entropy of the full data set, handed to every root
  val I = Resource.defaultEnt(all)
  // Question indexes available for splitting
  val indexes = 0 to (all.head.field.size - 1)
  // The classifiers: 15 trees, each grown from its own random sample
  val roots = (1 to 15).map(i => Node(I, indexes.toList, Resource.dataSets(all)))
  // Sample to classify.
  // NOTE(review): these literals look mis-encoded (UTF-8 read as Latin-1); they are
  // kept verbatim and must match the values stored in the data file - confirm.
  val test = List("èé£", "èç", "æ温")
  val results = roots.map(n => n.whats(test))
  println(s"Animal: $test : $results")
}
読みづらい。
ãªãã¡ã¯ã¿ãªãããã¦ããæããªãã£ãã®ã ããä»æ¹ç¡ãã
アヤメの種類識別
データ
ãã¹ããã¼ã¿ã¯ã¢ã¤ã¡ã®èã£ã±ã®ãµã¤ãºã ãã㯠第三回機械学習アルゴリズム実装会 - connpassã§ç´¹ä»ããããã¼ã¿ã ã
https://github.com/watanabetanaka/randomForest/
ã¢ã¤ã¡ã®ç¨®é¡ã¯ãèãèãè±ã®ãµã¤ãºã§èå¥åºæ¥ãã¨ããçå±ãããã
package iris

import scala.io.Source

/**
  * One labelled iris measurement.
  *
  * @param id     record number taken from the first column of the data file
  * @param field  the four numeric measurements, addressed by feature index
  * @param result class label of the record
  */
case class Line(id: Long, field: List[Double], result: String)

/** Loads the iris data set from the classpath. */
object Resource {

  /**
    * Reads the tab-separated resource `../iris.dat` (resolved relative to this
    * class). Columns: id, four measurements, label. Blank lines are skipped.
    *
    * The file handle is closed before returning, also when parsing fails.
    *
    * @return all records of the file, in file order
    * @throws java.io.FileNotFoundException when the resource cannot be located
    */
  def read(): List[Line] = {
    val resource = Option(this.getClass.getResource("../iris.dat")).getOrElse(
      throw new java.io.FileNotFoundException("resource ../iris.dat was not found on the classpath"))
    val file = Source.fromFile(resource.getPath, "utf-8")
    try {
      file.getLines().filterNot(_.isEmpty).map { s =>
        val splits = s.split("\t")
        Line(
          splits(0).toLong,
          List(splits(1).toDouble, splits(2).toDouble, splits(3).toDouble, splits(4).toDouble),
          splits(5)
        )
      }.toList // materialise while the file is still open
    } finally {
      file.close()
    }
  }
}

/** Decision-tree construction and the tree node types. */
object RandomForest {

  /**
    * Draws a random training sample for one tree.
    *
    * @param data the full data set
    * @return `(data.size / 3) * 2` records picked at random
    */
  def dataSets(data: List[Line]): List[Line] = {
    import scala.util.Random
    Random.shuffle(data).take((data.size / 3) * 2)
  }

  /**
    * Shannon entropy (natural logarithm) of the label distribution.
    *
    * @param data records whose `result` labels are counted
    * @return `-sum(p * ln p)` over the labels; `0.0` for an empty or single-label list
    */
  def defaultEnt(data: List[Line]): Double = {
    val total = data.size.toDouble
    data.groupBy(_.result).values.map { rows =>
      val p = rows.size / total
      -(p * Math.log(p))
    }.sum
  }

  /** A tree node; `result` is the majority label of the training data that reached it. */
  trait Node {
    val result: String
    val isLeaf: Boolean

    /** Prints this subtree, one node per line, each line prefixed with `s`. */
    def dispTree(s: String): Unit

    /** Classifies one sample; by default a node answers with its own `result`. */
    def test(dataSet: List[Double]): String = result
  }

  /**
    * Inner node that compares one feature against a threshold.
    *
    * @param index      feature index that is compared
    * @param entropy    score of the split that was chosen for this node
    * @param threshHold samples below it go `right`, all others go `left`
    * @param left       subtree for `value >= threshHold`
    * @param right      subtree for `value < threshHold`
    * @param result     majority label of the training data at this node
    */
  case class Branch(index: Int, entropy: Double, threshHold: Double, left: Option[Node], right: Option[Node], result: String) extends Node {
    val isLeaf: Boolean = false

    def dispTree(s: String): Unit = {
      println(s"$s ${this.toString}")
      val next = s"$s "
      left.foreach(_.dispTree(next))
      right.foreach(_.dispTree(next))
    }

    /**
      * Follows the side selected by the threshold. When that side is missing,
      * the other side is used, whichever direction that is. When both are
      * missing, the node's own majority `result` is returned.
      */
    override def test(dataSet: List[Double]): String = {
      val (preferred, fallback) =
        if (dataSet(index) < threshHold) (right, left) else (left, right)
      preferred.orElse(fallback).map(_.test(dataSet)).getOrElse(result)
    }
  }

  /** Terminal node that always answers with `result`. */
  case class Leaf(result: String) extends Node {
    val isLeaf: Boolean = true

    def dispTree(s: String): Unit = {
      println(s"$s ${this.toString}")
    }
  }

  /**
    * Grows a decision tree recursively.
    *
    * Every remaining feature is split at the mean of its values. The feature
    * with the highest information gain is chosen, where the gain is `i` minus
    * the size-weighted label entropy of the two halves. Splits that leave one
    * half empty are never chosen. A feature is used at most once on any path
    * from the root.
    *
    * @param i        label entropy before the split; the caller supplies it for the root,
    *                 recursive calls pass the entropy of the subset
    * @param indexes  feature indexes still available
    * @param dataSets training records for this subtree; must not be empty
    * @return a [[Leaf]] when the records share one label or no feature separates them,
    *         otherwise a [[Branch]] with both children present
    */
  def createNode(i: Double, indexes: List[Int], dataSets: List[Line]): Node = {
    require(dataSets.nonEmpty, "createNode needs at least one training record")

    val dataSize = dataSets.size.toDouble

    // Majority label of the current records.
    def majority: String =
      dataSets.groupBy(_.result).map { case (label, rows) => label -> rows.size }.maxBy(_._2)._1

    // Label entropy that remains after a split, weighted by the share of records in each part.
    def remainingEntropy(parts: List[List[Line]]): Double =
      parts.map(part => part.size / dataSize * defaultEnt(part)).sum

    val pure = dataSets.groupBy(_.result).size == 1

    // (index, threshold, gain, left, right) for every feature that really separates the records.
    val candidates =
      if (pure) Nil
      else indexes.map { index =>
        val threshHold = dataSets.map(_.field(index)).sum / dataSize
        val (right, left) = dataSets.partition(_.field(index) < threshHold)
        (index, threshHold, left, right)
      }.collect {
        // A split with an empty side carries no information (and used to score NaN).
        case (index, threshHold, left, right) if left.nonEmpty && right.nonEmpty =>
          (index, threshHold, i - remainingEntropy(List(left, right)), left, right)
      }

    if (candidates.isEmpty) {
      Leaf(majority)
    } else {
      val (index, threshHold, gain, left, right) = candidates.maxBy(_._3)
      val subIndexes = indexes.filterNot(_ == index)
      def makeNode(subset: List[Line]): Option[Node] =
        Some(createNode(defaultEnt(subset), subIndexes, subset))
      Branch(index, gain, threshHold, makeNode(left), makeNode(right), majority)
    }
  }
}

/** Demo entry point: grows 20 trees, prints them and classifies one known sample. */
object IrisForest extends App {
  // Data source
  val all = Resource.read()
  // Label entropy of the full data set, handed to every root
  val I = RandomForest.defaultEnt(all)
  // NOTE(review): the label text below looks mis-encoded (UTF-8 read as Latin-1); kept verbatim - confirm.
  println(s"åæã¨ã³ãããã¼ : $I")
  // Feature indexes available for splitting
  val indexes = (0 to (all.head.field.size - 1)).toList
  // Build the classifiers
  val mls = (1 to 20).map(_ => RandomForest.createNode(I, indexes, RandomForest.dataSets(all))).toList
  mls.foreach(_.dispTree(""))
  // Try it on record 87: 6.7 3.1 4.7 1.5 versicolor
  val data = List(6.7, 3.1, 4.7, 1.5)
  println(s"TEST: 87, 6.7, 3.1, 4.7, 1.5 versicolor, Results: ${mls.map(_.test(data))}")
}