[PIO-193] Async support for predict and storage access

Blocking code is wrapped in the `blocking` construct.
apache · longliveenduro · Oct 15, 2018 · Oct 15, 2018 · Oct 17, 2018 · Oct 23, 2018
commit b0b8a6fdcae4b49b0b4f888a35187f45ba11103c
diff --git a/core/src/main/scala/org/apache/predictionio/controller/LAlgorithm.scala b/core/src/main/scala/org/apache/predictionio/controller/LAlgorithm.scala
@@ -24,6 +24,9 @@ import org.apache.predictionio.workflow.PersistentModelManifest
 import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
 
+import scala.concurrent.duration._
+import scala.concurrent.{Await, ExecutionContext, Future, blocking}
+import scala.language.postfixOps
 import scala.reflect._
 
 /** Base class of a local algorithm.
@@ -72,11 +75,13 @@ abstract class LAlgorithm[PD, M : ClassTag, Q, P]
     val glomQs: RDD[Array[(Long, Q)]] = qs.glom()
     val cartesian: RDD[(M, Array[(Long, Q)])] = mRDD.cartesian(glomQs)
     cartesian.flatMap { case (m, qArray) =>
-      qArray.map { case (qx, q) => (qx, predict(m, q)) }
+      qArray.map {
+        case (qx, q) => (qx, Await.result(predict(m, q)(scala.concurrent.ExecutionContext.global), 60 minutes) )
+      }
     }
   }
 
-  def predictBase(localBaseModel: Any, q: Q): P = {
+  def predictBase(localBaseModel: Any, q: Q)(implicit ec: ExecutionContext): Future[P] = {
     predict(localBaseModel.asInstanceOf[M], q)
   }
 
@@ -87,7 +92,7 @@ abstract class LAlgorithm[PD, M : ClassTag, Q, P]
     * @param q An input query.
     * @return A prediction.
     */
-  def predict(m: M, q: Q): P
+  def predict(m: M, q: Q)(implicit ec: ExecutionContext): Future[P]
 
   /** :: DeveloperApi ::
     * Engine developers should not use this directly (read on to see how local

diff --git a/core/src/main/scala/org/apache/predictionio/controller/P2LAlgorithm.scala b/core/src/main/scala/org/apache/predictionio/controller/P2LAlgorithm.scala
@@ -25,6 +25,9 @@ import org.apache.spark.SparkContext
 import org.apache.spark.SparkContext._
 import org.apache.spark.rdd.RDD
 
+import scala.concurrent.duration._
+import scala.concurrent.{Await, ExecutionContext, Future, blocking}
+import scala.language.postfixOps
 import scala.reflect._
 
 /** Base class of a parallel-to-local algorithm.
@@ -67,10 +70,10 @@ abstract class P2LAlgorithm[PD, M: ClassTag, Q: ClassTag, P]
     * @return Batch of predicted results
     */
   def batchPredict(m: M, qs: RDD[(Long, Q)]): RDD[(Long, P)] = {
-    qs.mapValues { q => predict(m, q) }
+    qs.mapValues { q => Await.result(predict(m, q)(scala.concurrent.ExecutionContext.global), 60 minutes) }
   }
 
-  def predictBase(bm: Any, q: Q): P = predict(bm.asInstanceOf[M], q)
+  def predictBase(bm: Any, q: Q)(implicit ec: ExecutionContext): Future[P] = predict(bm.asInstanceOf[M], q)
 
   /** Implement this method to produce a prediction from a query and trained
     * model.
@@ -79,7 +82,7 @@ abstract class P2LAlgorithm[PD, M: ClassTag, Q: ClassTag, P]
     * @param query An input query.
     * @return A prediction.
     */
-  def predict(model: M, query: Q): P
+  def predict(model: M, query: Q)(implicit ec: ExecutionContext): Future[P]
 
   /** :: DeveloperApi ::
     * Engine developers should not use this directly (read on to see how

diff --git a/core/src/main/scala/org/apache/predictionio/controller/PAlgorithm.scala b/core/src/main/scala/org/apache/predictionio/controller/PAlgorithm.scala
@@ -24,6 +24,8 @@ import org.apache.predictionio.workflow.PersistentModelManifest
 import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
 
+import scala.concurrent.{ExecutionContext, Future}
+
 /** Base class of a parallel algorithm.
   *
   * A parallel algorithm can be run in parallel on a cluster and produces a
@@ -72,7 +74,7 @@ abstract class PAlgorithm[PD, M, Q, P]
   def batchPredict(m: M, qs: RDD[(Long, Q)]): RDD[(Long, P)] =
     throw new NotImplementedError("batchPredict not implemented")
 
-  def predictBase(baseModel: Any, query: Q): P = {
+  def predictBase(baseModel: Any, query: Q)(implicit ec: ExecutionContext): Future[P] = {
     predict(baseModel.asInstanceOf[M], query)
   }
 
@@ -83,7 +85,7 @@ abstract class PAlgorithm[PD, M, Q, P]
     * @param query An input query.
     * @return A prediction.
     */
-  def predict(model: M, query: Q): P
+  def predict(model: M, query: Q)(implicit ec: ExecutionContext): Future[P]
 
   /** :: DeveloperApi ::
     * Engine developers should not use this directly (read on to see how parallel

diff --git a/core/src/main/scala/org/apache/predictionio/core/BaseAlgorithm.scala b/core/src/main/scala/org/apache/predictionio/core/BaseAlgorithm.scala
@@ -26,6 +26,8 @@ import net.jodah.typetools.TypeResolver
 import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
 
+import scala.concurrent.{ExecutionContext, Future}
+
 /** :: DeveloperApi ::
   * Base trait with default custom query serializer, exposed to engine developer
   * via [[org.apache.predictionio.controller.CustomQuerySerializer]]
@@ -90,7 +92,7 @@ abstract class BaseAlgorithm[PD, M, Q, P]
     * @return Predicted result
     */
   @DeveloperApi
-  def predictBase(bm: Any, q: Q): P
+  def predictBase(bm: Any, q: Q)(implicit ec: ExecutionContext): Future[P]
 
   /** :: DeveloperApi ::
     * Engine developers should not use this directly. Prepare a model for

diff --git a/core/src/main/scala/org/apache/predictionio/workflow/BatchPredict.scala b/core/src/main/scala/org/apache/predictionio/workflow/BatchPredict.scala
@@ -32,7 +32,12 @@ import org.apache.predictionio.workflow.CleanupFunctions
 import org.apache.spark.rdd.RDD
 import org.json4s._
 import org.json4s.native.JsonMethods._
+import scala.concurrent.duration._
+import scala.language.postfixOps
+import scala.concurrent.blocking
+import scala.concurrent.{Await, Future}
 import scala.language.existentials
+import scala.concurrent.ExecutionContext.Implicits.global
 
 case class BatchPredictConfig(
   inputFilePath: String = "batchpredict-input.json",
@@ -207,23 +212,26 @@ object BatchPredict extends Logging {
         // Deploy logic. First call Serving.supplement, then Algo.predict,
         // finally Serving.serve.
         val supplementedQuery = serving.supplementBase(query)
-        // TODO: Parallelize the following.
-        val predictions = algorithms.zip(models).map { case (a, m) =>
+        val predictionsFuture = Future.sequence(algorithms.zip(models).map { case (a, m) =>
           a.predictBase(m, supplementedQuery)
-        }
+        })
         // Notice that it is by design to call Serving.serve with the
         // *original* query.
-        val prediction = serving.serveBase(query, predictions)
-        // Combine query with prediction, so the batch results are
-        // self-descriptive.
-        val predictionJValue = JsonExtractor.toJValue(
-          jsonExtractorOption,
-          Map("query" -> query,
-              "prediction" -> prediction),
-          algorithms.head.querySerializer,
-          algorithms.head.gsonTypeAdapterFactories)
-        // Return JSON string
-        compact(render(predictionJValue))
+        val predFutureRdds = predictionsFuture.map {
+          predictions =>
+            val prediction = serving.serveBase(query, predictions)
+            // Combine query with prediction, so the batch results are
+            // self-descriptive.
+            val predictionJValue = JsonExtractor.toJValue(
+              jsonExtractorOption,
+              Map("query" -> query,
+                  "prediction" -> prediction),
+              algorithms.head.querySerializer,
+              algorithms.head.gsonTypeAdapterFactories)
+            // Return JSON string
+            compact(render(predictionJValue))
+        }
+        Await.result(predFutureRdds, 60 minutes)
       }
 
       predictionsRDD.saveAsTextFile(config.outputFilePath)