Skip to content

Commit 1c5b198

Browse files
mnazbro and Davies Liu
authored and committed
[SPARK-7899] [PYSPARK] Fix Python 3 pyspark/sql/types module conflict
This PR makes the types module in `pyspark/sql/types` work with pylint static analysis by removing the dynamic naming of the `pyspark/sql/_types` module to `pyspark/sql/types`. Tests are now loaded using `$PYSPARK_DRIVER_PYTHON -m module` rather than `$PYSPARK_DRIVER_PYTHON module.py`. The old method adds the location of `module.py` to `sys.path`, so this change prevents accidental use of relative paths in Python. Author: Michael Nazario <[email protected]> Closes apache#6439 from mnazario/feature/SPARK-7899 and squashes the following commits: 366ef30 [Michael Nazario] Remove hack on random.py bb8b04d [Michael Nazario] Make doctests consistent with other tests 6ee4f75 [Michael Nazario] Change test scripts to use "-m" 673528f [Michael Nazario] Move _types back to types
1 parent 5f48e5c commit 1c5b198

7 files changed

Lines changed: 43 additions & 63 deletions

File tree

bin/pyspark

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,7 @@ if [[ -n "$SPARK_TESTING" ]]; then
9090
unset YARN_CONF_DIR
9191
unset HADOOP_CONF_DIR
9292
export PYTHONHASHSEED=0
93-
if [[ -n "$PYSPARK_DOC_TEST" ]]; then
94-
exec "$PYSPARK_DRIVER_PYTHON" -m doctest $1
95-
else
96-
exec "$PYSPARK_DRIVER_PYTHON" $1
97-
fi
93+
exec "$PYSPARK_DRIVER_PYTHON" -m $1
9894
exit
9995
fi
10096

python/pyspark/accumulators.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,3 +261,7 @@ def _start_update_server():
261261
thread.daemon = True
262262
thread.start()
263263
return server
264+
265+
if __name__ == "__main__":
266+
import doctest
267+
doctest.testmod()

python/pyspark/mllib/__init__.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,3 @@
2828

2929
__all__ = ['classification', 'clustering', 'feature', 'fpm', 'linalg', 'random',
3030
'recommendation', 'regression', 'stat', 'tree', 'util']
31-
32-
import sys
33-
from . import rand as random
34-
modname = __name__ + '.random'
35-
random.__name__ = modname
36-
random.RandomRDDs.__module__ = modname
37-
sys.modules[modname] = random
38-
del modname, sys

python/pyspark/sql/__init__.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -50,18 +50,6 @@ def deco(f):
5050
return f
5151
return deco
5252

53-
# fix the module name conflict for Python 3+
54-
import sys
55-
from . import _types as types
56-
modname = __name__ + '.types'
57-
types.__name__ = modname
58-
# update the __module__ for all objects, make them picklable
59-
for v in types.__dict__.values():
60-
if hasattr(v, "__module__") and v.__module__.endswith('._types'):
61-
v.__module__ = modname
62-
sys.modules[modname] = types
63-
del modname, sys
64-
6553
from pyspark.sql.types import Row
6654
from pyspark.sql.context import SQLContext, HiveContext
6755
from pyspark.sql.column import Column

python/run-tests

Lines changed: 38 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -57,54 +57,54 @@ function run_test() {
5757

5858
function run_core_tests() {
5959
echo "Run core tests ..."
60-
run_test "pyspark/rdd.py"
61-
run_test "pyspark/context.py"
62-
run_test "pyspark/conf.py"
63-
PYSPARK_DOC_TEST=1 run_test "pyspark/broadcast.py"
64-
PYSPARK_DOC_TEST=1 run_test "pyspark/accumulators.py"
65-
run_test "pyspark/serializers.py"
66-
run_test "pyspark/profiler.py"
67-
run_test "pyspark/shuffle.py"
68-
run_test "pyspark/tests.py"
60+
run_test "pyspark.rdd"
61+
run_test "pyspark.context"
62+
run_test "pyspark.conf"
63+
run_test "pyspark.broadcast"
64+
run_test "pyspark.accumulators"
65+
run_test "pyspark.serializers"
66+
run_test "pyspark.profiler"
67+
run_test "pyspark.shuffle"
68+
run_test "pyspark.tests"
6969
}
7070

7171
function run_sql_tests() {
7272
echo "Run sql tests ..."
73-
run_test "pyspark/sql/_types.py"
74-
run_test "pyspark/sql/context.py"
75-
run_test "pyspark/sql/column.py"
76-
run_test "pyspark/sql/dataframe.py"
77-
run_test "pyspark/sql/group.py"
78-
run_test "pyspark/sql/functions.py"
79-
run_test "pyspark/sql/tests.py"
73+
run_test "pyspark.sql.types"
74+
run_test "pyspark.sql.context"
75+
run_test "pyspark.sql.column"
76+
run_test "pyspark.sql.dataframe"
77+
run_test "pyspark.sql.group"
78+
run_test "pyspark.sql.functions"
79+
run_test "pyspark.sql.tests"
8080
}
8181

8282
function run_mllib_tests() {
8383
echo "Run mllib tests ..."
84-
run_test "pyspark/mllib/classification.py"
85-
run_test "pyspark/mllib/clustering.py"
86-
run_test "pyspark/mllib/evaluation.py"
87-
run_test "pyspark/mllib/feature.py"
88-
run_test "pyspark/mllib/fpm.py"
89-
run_test "pyspark/mllib/linalg.py"
90-
run_test "pyspark/mllib/rand.py"
91-
run_test "pyspark/mllib/recommendation.py"
92-
run_test "pyspark/mllib/regression.py"
93-
run_test "pyspark/mllib/stat/_statistics.py"
94-
run_test "pyspark/mllib/tree.py"
95-
run_test "pyspark/mllib/util.py"
96-
run_test "pyspark/mllib/tests.py"
84+
run_test "pyspark.mllib.classification"
85+
run_test "pyspark.mllib.clustering"
86+
run_test "pyspark.mllib.evaluation"
87+
run_test "pyspark.mllib.feature"
88+
run_test "pyspark.mllib.fpm"
89+
run_test "pyspark.mllib.linalg"
90+
run_test "pyspark.mllib.random"
91+
run_test "pyspark.mllib.recommendation"
92+
run_test "pyspark.mllib.regression"
93+
run_test "pyspark.mllib.stat._statistics"
94+
run_test "pyspark.mllib.tree"
95+
run_test "pyspark.mllib.util"
96+
run_test "pyspark.mllib.tests"
9797
}
9898

9999
function run_ml_tests() {
100100
echo "Run ml tests ..."
101-
run_test "pyspark/ml/feature.py"
102-
run_test "pyspark/ml/classification.py"
103-
run_test "pyspark/ml/recommendation.py"
104-
run_test "pyspark/ml/regression.py"
105-
run_test "pyspark/ml/tuning.py"
106-
run_test "pyspark/ml/tests.py"
107-
run_test "pyspark/ml/evaluation.py"
101+
run_test "pyspark.ml.feature"
102+
run_test "pyspark.ml.classification"
103+
run_test "pyspark.ml.recommendation"
104+
run_test "pyspark.ml.regression"
105+
run_test "pyspark.ml.tuning"
106+
run_test "pyspark.ml.tests"
107+
run_test "pyspark.ml.evaluation"
108108
}
109109

110110
function run_streaming_tests() {
@@ -124,8 +124,8 @@ function run_streaming_tests() {
124124
done
125125

126126
export PYSPARK_SUBMIT_ARGS="--jars ${KAFKA_ASSEMBLY_JAR} pyspark-shell"
127-
run_test "pyspark/streaming/util.py"
128-
run_test "pyspark/streaming/tests.py"
127+
run_test "pyspark.streaming.util"
128+
run_test "pyspark.streaming.tests"
129129
}
130130

131131
echo "Running PySpark tests. Output is in python/$LOG_FILE."

0 commit comments

Comments (0)