Skip to content

Commit 8c2edf4

Browse files
[SPARK-24324][PYTHON][FOLLOW-UP] Rename the Conf to spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName
## What changes were proposed in this pull request?

Add the legacy prefix for spark.sql.execution.pandas.groupedMap.assignColumnsByPosition and rename it to spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName.

## How was this patch tested?

The existing tests.

Closes apache#22540 from gatorsmile/renameAssignColumnsByPosition.

Lead-authored-by: gatorsmile <[email protected]>
Co-authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: hyukjinkwon <[email protected]>
1 parent 9bb3a0c commit 8c2edf4

4 files changed

Lines changed: 18 additions & 19 deletions

File tree

python/pyspark/sql/tests.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5802,7 +5802,8 @@ def test_positional_assignment_conf(self):
58025802
import pandas as pd
58035803
from pyspark.sql.functions import pandas_udf, PandasUDFType
58045804

5805-
with self.sql_conf({"spark.sql.execution.pandas.groupedMap.assignColumnsByPosition": True}):
5805+
with self.sql_conf({
5806+
"spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName": False}):
58065807

58075808
@pandas_udf("a string, b float", PandasUDFType.GROUPED_MAP)
58085809
def foo(_):

python/pyspark/worker.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,9 @@ def verify_result_length(*a):
9797

9898

9999
def wrap_grouped_map_pandas_udf(f, return_type, argspec, runner_conf):
100-
assign_cols_by_pos = runner_conf.get(
101-
"spark.sql.execution.pandas.groupedMap.assignColumnsByPosition", False)
100+
assign_cols_by_name = runner_conf.get(
101+
"spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")
102+
assign_cols_by_name = assign_cols_by_name.lower() == "true"
102103

103104
def wrapped(key_series, value_series):
104105
import pandas as pd
@@ -119,7 +120,7 @@ def wrapped(key_series, value_series):
119120
"Expected: {} Actual: {}".format(len(return_type), len(result.columns)))
120121

121122
# Assign result columns by schema name if user labeled with strings, else use position
122-
if not assign_cols_by_pos and any(isinstance(name, basestring) for name in result.columns):
123+
if assign_cols_by_name and any(isinstance(name, basestring) for name in result.columns):
123124
return [(result[field.name], to_arrow_type(field.dataType)) for field in return_type]
124125
else:
125126
return [(result[result.columns[i]], to_arrow_type(field.dataType))

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1295,15 +1295,15 @@ object SQLConf {
12951295
.booleanConf
12961296
.createWithDefault(true)
12971297

1298-
val PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_POSITION =
1299-
buildConf("spark.sql.execution.pandas.groupedMap.assignColumnsByPosition")
1298+
val PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_NAME =
1299+
buildConf("spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName")
13001300
.internal()
1301-
.doc("When true, a grouped map Pandas UDF will assign columns from the returned " +
1302-
"Pandas DataFrame based on position, regardless of column label type. When false, " +
1303-
"columns will be looked up by name if labeled with a string and fallback to use " +
1304-
"position if not. This configuration will be deprecated in future releases.")
1301+
.doc("When true, columns will be looked up by name if labeled with a string and fallback " +
1302+
"to use position if not. When false, a grouped map Pandas UDF will assign columns from " +
1303+
"the returned Pandas DataFrame based on position, regardless of column label type. " +
1304+
"This configuration will be deprecated in future releases.")
13051305
.booleanConf
1306-
.createWithDefault(false)
1306+
.createWithDefault(true)
13071307

13081308
val REPLACE_EXCEPT_WITH_FILTER = buildConf("spark.sql.optimizer.replaceExceptWithFilter")
13091309
.internal()
@@ -1915,8 +1915,8 @@ class SQLConf extends Serializable with Logging {
19151915

19161916
def pandasRespectSessionTimeZone: Boolean = getConf(PANDAS_RESPECT_SESSION_LOCAL_TIMEZONE)
19171917

1918-
def pandasGroupedMapAssignColumnssByPosition: Boolean =
1919-
getConf(SQLConf.PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_POSITION)
1918+
def pandasGroupedMapAssignColumnsByName: Boolean =
1919+
getConf(SQLConf.PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_NAME)
19201920

19211921
def replaceExceptWithFilter: Boolean = getConf(REPLACE_EXCEPT_WITH_FILTER)
19221922

sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -131,11 +131,8 @@ object ArrowUtils {
131131
} else {
132132
Nil
133133
}
134-
val pandasColsByPosition = if (conf.pandasGroupedMapAssignColumnssByPosition) {
135-
Seq(SQLConf.PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_POSITION.key -> "true")
136-
} else {
137-
Nil
138-
}
139-
Map(timeZoneConf ++ pandasColsByPosition: _*)
134+
val pandasColsByName = Seq(SQLConf.PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_NAME.key ->
135+
conf.pandasGroupedMapAssignColumnsByName.toString)
136+
Map(timeZoneConf ++ pandasColsByName: _*)
140137
}
141138
}

0 commit comments

Comments (0)