| author    | CoprDistGit <infra@openeuler.org>                              | 2023-04-10 16:17:35 +0000 |
| committer | CoprDistGit <infra@openeuler.org>                              | 2023-04-10 16:17:35 +0000 |
| commit    | eeb83c47c17bd5a0ccda27c4291983516b3c5574                       |                           |
| tree      | 9d8d38b0a2bdabc3295a87ad7f1b1b8d6b1c0818 /python-sparkaid.spec |                           |
| parent    | 747638bc305cd18f2d3dbcfd250ba52507e1a9ce                       |                           |
automatic import of python-sparkaid
Diffstat (limited to 'python-sparkaid.spec')
| -rw-r--r-- | python-sparkaid.spec | 576 |

1 file changed, 576 insertions, 0 deletions
diff --git a/python-sparkaid.spec b/python-sparkaid.spec
new file mode 100644
index 0000000..55cbc48
--- /dev/null
+++ b/python-sparkaid.spec
@@ -0,0 +1,576 @@
%global _empty_manifest_terminate_build 0
Name: python-sparkaid
Version: 1.0.0
Release: 1
Summary: Utils for working with Spark
License: GNU Lesser General Public License v3 or later (LGPLv3+)
URL: https://github.com/lvhuyen/SparkAid
Source0: https://mirrors.nju.edu.cn/pypi/web/packages/c1/1e/92e74e641719430d3d6216e4eef34e90a89876080c2013d8a5aa557b4284/sparkaid-1.0.0.tar.gz
BuildArch: noarch

Requires: python3-pyspark
Requires: python3-pip-tools
Requires: python3-pytest

%description
## Flattening
### StructType
Sample DataFrame:

    from pyspark.sql import Row
    from pyspark.sql.functions import col
    df_struct = spark.createDataFrame([Row(structA=Row(field1=10, field2=1.5),
                                           structB=Row(field3="one", field4=False))])
    df_struct.printSchema()

    root
     |-- structA: struct (nullable = true)
     |    |-- field1: long (nullable = true)
     |    |-- field2: double (nullable = true)
     |-- structB: struct (nullable = true)
     |    |-- field3: string (nullable = true)
     |    |-- field4: boolean (nullable = true)

Spark allows selecting nested columns by using the dot `.` notation:

    df_struct.select("structA.*", "structB.field3").printSchema()

    root
     |-- field1: long (nullable = true)
     |-- field2: double (nullable = true)
     |-- field3: string (nullable = true)

Note that the current Spark implementation (2.4.3 or below) doesn't keep the outer-layer field name (e.g. structA) in the output dataframe.
### ArrayType
To select only some elements from an ArrayType column, either *`getItem()`* or brackets (as when selecting elements from a plain array: `[]` in Python, `()` in Scala) would do the trick:

    df_array = spark.createDataFrame([Row(arrayA=[1, 2, 3, 4, 5], fieldB="foo")])
    df_array.select(col("arrayA").getItem(0).alias("element0"),
                    col("arrayA")[4].alias("element5"),
                    col("fieldB")).show()

    +--------+--------+------+
    |element0|element5|fieldB|
    +--------+--------+------+
    |       1|       5|   foo|
    +--------+--------+------+

### MapType
Elements from a MapType column can be selected the same way as with ArrayType, but using the key instead of an index number. The dot notation (`.`) can also be used instead of `getItem()` or brackets:

    df_map = spark.createDataFrame([Row(mapA={2: "TWO", 3: "THREE", 0: "ZERO"}, fieldB="foo")])
    df_map.select(col("mapA")[3].alias("element3"),
                  col("mapA").getItem(2).alias("element2"),
                  col("mapA.0").alias("element0"),
                  col("mapA").getItem(1).alias("element1")).show()

    +--------+--------+--------+--------+
    |element3|element2|element0|element1|
    +--------+--------+--------+--------+
    |   THREE|     TWO|    ZERO|    null|
    +--------+--------+--------+--------+

### StructType nested in StructType
As Spark DataFrame.select() supports passing an array of columns to be selected, to fully flatten a multi-layer nested dataframe, a recursive call would do the trick; a minimal sketch follows.
Here is a detailed discussion on Stack Overflow on how to do this:
https://stackoverflow.com/questions/37471346/automatically-and-elegantly-flatten-dataframe-in-spark-sql
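For illustration only, here is a minimal sketch of that recursive approach. This is a hypothetical helper, not the implementation used by `sparkaid`, and it assumes structs are not nested inside arrays or maps:

    from pyspark.sql import DataFrame
    from pyspark.sql.functions import col
    from pyspark.sql.types import StructType

    def flatten_structs(df: DataFrame) -> DataFrame:
        """Recursively expand StructType columns into top-level columns
        (hypothetical helper; handles structs only)."""
        found_struct = False
        flat_cols = []
        for field in df.schema.fields:
            if isinstance(field.dataType, StructType):
                found_struct = True
                # Expand every child field, prefixing it with the parent
                # name so the flattened column names stay unique
                for child in field.dataType.fields:
                    flat_cols.append(col(field.name + "." + child.name)
                                     .alias(field.name + "_" + child.name))
            else:
                flat_cols.append(col(field.name))
        flattened = df.select(flat_cols)
        # Recurse until no StructType columns remain
        return flatten_structs(flattened) if found_struct else flattened

Applied to `df_struct` above, this sketch would yield the columns structA_field1, structA_field2, structB_field3 and structB_field4, keeping the outer-layer names that a plain `select("structA.*")` drops.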
### StructType nested in ArrayType

    df_nested = spark.createDataFrame([
        Row(arrayA=[
            Row(childStructB=Row(field1=1, field2="foo")),
            Row(childStructB=Row(field1=2, field2="bar"))
        ])])
    df_nested.printSchema()

    root
     |-- arrayA: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- childStructB: struct (nullable = true)
     |    |    |    |-- field1: long (nullable = true)
     |    |    |    |-- field2: string (nullable = true)

    df_nested.show(1, False)

    +------------------------+
    |arrayA                  |
    +------------------------+
    |[[[1, foo]], [[2, bar]]]|
    +------------------------+

Selecting *field1* or *field2* can be done as with normal structs (not nested inside an array), by using the dot `.` notation. The result is of the type `ArrayType[ChildFieldType]`, which has been **_vertically sliced_** from the original array:

    df_child = df_nested.select("arrayA.childStructB.field1", "arrayA.childStructB.field2")
    df_child.printSchema()

    root
     |-- field1: array (nullable = true)
     |    |-- element: long (containsNull = true)
     |-- field2: array (nullable = true)
     |    |-- element: string (containsNull = true)

    df_child.show()

    +------+----------+
    |field1|    field2|
    +------+----------+
    |[1, 2]|[foo, bar]|
    +------+----------+

### StructType nested in MapType
As each MapType column has two components, the keys and the values, selecting a nested column inside a MapType column is not straightforward - we cannot simply use `.` to reach the nested fields, because the dot is already used for denoting the key:

    df_map_nested = spark.createDataFrame([Row(mapA={"2": Row(type_name="Arabic number", equivalent=2),
                                                     "THREE": Row(type_name="English Text", equivalent=3)},
                                               fieldB="foo")])
    df_map_nested.select(col("mapA.type_name"), col("mapA.THREE.type_name")).show()

    +---------+------------+
    |type_name|   type_name|
    +---------+------------+
    |     null|English Text|
    +---------+------------+

A solution for this is the built-in function `map_values()`, introduced in Spark 2.3. Note the type of the result column: ArrayType.

    from pyspark.sql.functions import map_values
    result = df_map_nested.select(map_values("mapA")["type_name"], col("mapA.THREE.type_name"))
    result.show(2, False)
    result.printSchema()

    +-----------------------------+------------+
    |map_values(mapA).type_name   |type_name   |
    +-----------------------------+------------+
    |[Arabic number, English Text]|English Text|
    +-----------------------------+------------+

    root
     |-- map_values(mapA).type_name: array (nullable = true)
     |    |-- element: string (containsNull = true)
     |-- type_name: string (nullable = true)

## Hurdles
The above steps work well for most dataframes. The only dataframes they fail on (as of Spark 2.4.3 or lower) are those with a StructType nested inside MORE THAN ONE layer of ArrayType.
Like this one:

    df_nested_B = spark.createDataFrame([
        Row(arrayA=[[
            Row(childStructB=Row(field1=1, field2="foo")),
            Row(childStructB=Row(field1=2, field2="bar"))
        ]])])
    df_nested_B.printSchema()

    root
     |-- arrayA: array (nullable = true)
     |    |-- element: array (containsNull = true)
     |    |    |-- element: struct (containsNull = true)
     |    |    |    |-- childStructB: struct (nullable = true)
     |    |    |    |    |-- field1: long (nullable = true)
     |    |    |    |    |-- field2: string (nullable = true)

Or this one:

    df_nested_C = spark.createDataFrame([
        Row(arrayA=[
            Row(childStructB=Row(childArrayC=[Row(field1=1, field2="foo")])),
            Row(childStructB=Row(childArrayC=[Row(field1=2, field2="bar")])),
        ])])
    df_nested_C.printSchema()

    root
     |-- arrayA: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- childStructB: struct (nullable = true)
     |    |    |    |-- childArrayC: array (nullable = true)
     |    |    |    |    |-- element: struct (containsNull = true)
     |    |    |    |    |    |-- field1: long (nullable = true)
     |    |    |    |    |    |-- field2: string (nullable = true)

Selecting `arrayA.childStructB.field1` from `df_nested_B` fails with the error message: `AnalysisException: No such struct field field1 in childStructB`.
Selecting `arrayA.childStructB.childArrayC.field1` from `df_nested_C` throws an `AnalysisException`: `cannot resolve 'arrayA.childStructB.childArrayC['field1']' due to data type mismatch: argument 2 requires integral type, however, ''field1'' is of string type.`
## (More) Solutions
With the introduction of the SQL function `transform` in Spark 2.4, the errors above can be solved by applying `transform` on every layer of the array, as sketched below.
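As an illustration only (a minimal sketch, assuming a Spark 2.4+ session and the `df_nested_B` / `df_nested_C` dataframes defined above; this is not the `sparkaid` code):

    from pyspark.sql.functions import expr

    # df_nested_B: arrayA is an array of arrays of structs, so one
    # transform per array layer is needed to reach field1
    df_nested_B.select(
        expr("transform(arrayA, xs -> transform(xs, x -> x.childStructB.field1))")
        .alias("field1")).show()

    # df_nested_C: the inner array sits below a struct, so the outer
    # transform drills through childStructB before the inner one runs
    df_nested_C.select(
        expr("transform(arrayA, a -> transform(a.childStructB.childArrayC, c -> c.field1))")
        .alias("field1")).show()

Each `transform` call peels off one array layer, so the nested lambda can reach the struct fields that plain dot notation cannot.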
A comprehensive implementation of a flatten function can be found in the Python package `sparkaid`:

    from sparkaid import flatten
    flatten(df_nested_B).printSchema()

    root
     |-- arrayA__childStructB_field1: array (nullable = true)
     |    |-- element: array (containsNull = true)
     |    |    |-- element: long (containsNull = true)
     |-- arrayA__childStructB_field2: array (nullable = true)
     |    |-- element: array (containsNull = true)
     |    |    |-- element: string (containsNull = true)

    flatten(df_nested_B).show()

    +---------------------------+---------------------------+
    |arrayA__childStructB_field1|arrayA__childStructB_field2|
    +---------------------------+---------------------------+
    |                   [[1, 2]]|               [[foo, bar]]|
    +---------------------------+---------------------------+

    flatten(df_nested_C).printSchema()

    root
     |-- arrayA_childStructB_childArrayC_field1: array (nullable = true)
     |    |-- element: array (containsNull = true)
     |    |    |-- element: long (containsNull = true)
     |-- arrayA_childStructB_childArrayC_field2: array (nullable = true)
     |    |-- element: array (containsNull = true)
     |    |    |-- element: string (containsNull = true)

[]: https://spark.apache.org/docs/2.4.0/api/java/org/apache/spark/sql/functions.html#flatten-org.apache.spark.sql.Column

%package -n python3-sparkaid
Summary: Utils for working with Spark
Provides: python-sparkaid
BuildRequires: python3-devel
BuildRequires: python3-setuptools
BuildRequires: python3-pip
%description -n python3-sparkaid
## Flattening
### StructType
Sample DataFrame:

    from pyspark.sql import Row
    from pyspark.sql.functions import col
    df_struct = spark.createDataFrame([Row(structA=Row(field1=10, field2=1.5),
                                           structB=Row(field3="one", field4=False))])
    df_struct.printSchema()

    root
     |-- structA: struct (nullable = true)
     |    |-- field1: long (nullable = true)
     |    |-- field2: double (nullable = true)
     |-- structB: struct (nullable = true)
     |    |-- field3: string (nullable = true)
     |    |-- field4: boolean (nullable = true)

Spark allows selecting nested columns by using the dot `.` notation:

    df_struct.select("structA.*", "structB.field3").printSchema()

    root
     |-- field1: long (nullable = true)
     |-- field2: double (nullable = true)
     |-- field3: string (nullable = true)

Note that the current Spark implementation (2.4.3 or below) doesn't keep the outer-layer field name (e.g. structA) in the output dataframe.
### ArrayType
To select only some elements from an ArrayType column, either *`getItem()`* or brackets (as when selecting elements from a plain array: `[]` in Python, `()` in Scala) would do the trick:

    df_array = spark.createDataFrame([Row(arrayA=[1, 2, 3, 4, 5], fieldB="foo")])
    df_array.select(col("arrayA").getItem(0).alias("element0"),
                    col("arrayA")[4].alias("element5"),
                    col("fieldB")).show()

    +--------+--------+------+
    |element0|element5|fieldB|
    +--------+--------+------+
    |       1|       5|   foo|
    +--------+--------+------+

### MapType
Elements from a MapType column can be selected the same way as with ArrayType, but using the key instead of an index number. The dot notation (`.`) can also be used instead of `getItem()` or brackets:

    df_map = spark.createDataFrame([Row(mapA={2: "TWO", 3: "THREE", 0: "ZERO"}, fieldB="foo")])
    df_map.select(col("mapA")[3].alias("element3"),
                  col("mapA").getItem(2).alias("element2"),
                  col("mapA.0").alias("element0"),
                  col("mapA").getItem(1).alias("element1")).show()

    +--------+--------+--------+--------+
    |element3|element2|element0|element1|
    +--------+--------+--------+--------+
    |   THREE|     TWO|    ZERO|    null|
    +--------+--------+--------+--------+

### StructType nested in StructType
As Spark DataFrame.select() supports passing an array of columns to be selected, to fully flatten a multi-layer nested dataframe, a recursive call would do the trick.
Here is a detailed discussion on Stack Overflow on how to do this:
https://stackoverflow.com/questions/37471346/automatically-and-elegantly-flatten-dataframe-in-spark-sql
### StructType nested in ArrayType

    df_nested = spark.createDataFrame([
        Row(arrayA=[
            Row(childStructB=Row(field1=1, field2="foo")),
            Row(childStructB=Row(field1=2, field2="bar"))
        ])])
    df_nested.printSchema()

    root
     |-- arrayA: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- childStructB: struct (nullable = true)
     |    |    |    |-- field1: long (nullable = true)
     |    |    |    |-- field2: string (nullable = true)

    df_nested.show(1, False)

    +------------------------+
    |arrayA                  |
    +------------------------+
    |[[[1, foo]], [[2, bar]]]|
    +------------------------+

Selecting *field1* or *field2* can be done as with normal structs (not nested inside an array), by using the dot `.` notation. The result is of the type `ArrayType[ChildFieldType]`, which has been **_vertically sliced_** from the original array:
    df_child = df_nested.select("arrayA.childStructB.field1", "arrayA.childStructB.field2")
    df_child.printSchema()

    root
     |-- field1: array (nullable = true)
     |    |-- element: long (containsNull = true)
     |-- field2: array (nullable = true)
     |    |-- element: string (containsNull = true)

    df_child.show()

    +------+----------+
    |field1|    field2|
    +------+----------+
    |[1, 2]|[foo, bar]|
    +------+----------+

### StructType nested in MapType
As each MapType column has two components, the keys and the values, selecting a nested column inside a MapType column is not straightforward - we cannot simply use `.` to reach the nested fields, because the dot is already used for denoting the key:

    df_map_nested = spark.createDataFrame([Row(mapA={"2": Row(type_name="Arabic number", equivalent=2),
                                                     "THREE": Row(type_name="English Text", equivalent=3)},
                                               fieldB="foo")])
    df_map_nested.select(col("mapA.type_name"), col("mapA.THREE.type_name")).show()

    +---------+------------+
    |type_name|   type_name|
    +---------+------------+
    |     null|English Text|
    +---------+------------+

A solution for this is the built-in function `map_values()`, introduced in Spark 2.3. Note the type of the result column: ArrayType.

    from pyspark.sql.functions import map_values
    result = df_map_nested.select(map_values("mapA")["type_name"], col("mapA.THREE.type_name"))
    result.show(2, False)
    result.printSchema()

    +-----------------------------+------------+
    |map_values(mapA).type_name   |type_name   |
    +-----------------------------+------------+
    |[Arabic number, English Text]|English Text|
    +-----------------------------+------------+

    root
     |-- map_values(mapA).type_name: array (nullable = true)
     |    |-- element: string (containsNull = true)
     |-- type_name: string (nullable = true)

## Hurdles
The above steps work well for most dataframes. The only dataframes they fail on (as of Spark 2.4.3 or lower) are those with a StructType nested inside MORE THAN ONE layer of ArrayType.
Like this one:

    df_nested_B = spark.createDataFrame([
        Row(arrayA=[[
            Row(childStructB=Row(field1=1, field2="foo")),
            Row(childStructB=Row(field1=2, field2="bar"))
        ]])])
    df_nested_B.printSchema()

    root
     |-- arrayA: array (nullable = true)
     |    |-- element: array (containsNull = true)
     |    |    |-- element: struct (containsNull = true)
     |    |    |    |-- childStructB: struct (nullable = true)
     |    |    |    |    |-- field1: long (nullable = true)
     |    |    |    |    |-- field2: string (nullable = true)

Or this one:

    df_nested_C = spark.createDataFrame([
        Row(arrayA=[
            Row(childStructB=Row(childArrayC=[Row(field1=1, field2="foo")])),
            Row(childStructB=Row(childArrayC=[Row(field1=2, field2="bar")])),
        ])])
    df_nested_C.printSchema()

    root
     |-- arrayA: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- childStructB: struct (nullable = true)
     |    |    |    |-- childArrayC: array (nullable = true)
     |    |    |    |    |-- element: struct (containsNull = true)
     |    |    |    |    |    |-- field1: long (nullable = true)
     |    |    |    |    |    |-- field2: string (nullable = true)

Selecting `arrayA.childStructB.field1` from `df_nested_B` fails with the error message: `AnalysisException: No such struct field field1 in childStructB`.
Selecting `arrayA.childStructB.childArrayC.field1` from `df_nested_C` throws an `AnalysisException`: `cannot resolve 'arrayA.childStructB.childArrayC['field1']' due to data type mismatch: argument 2 requires integral type, however, ''field1'' is of string type.`
## (More) Solutions
With the introduction of the SQL function `transform` in Spark 2.4, the errors above can be solved by applying `transform` on every layer of the array.
A comprehensive implementation of a flatten function can be found in the Python package `sparkaid`:

    from sparkaid import flatten
    flatten(df_nested_B).printSchema()

    root
     |-- arrayA__childStructB_field1: array (nullable = true)
     |    |-- element: array (containsNull = true)
     |    |    |-- element: long (containsNull = true)
     |-- arrayA__childStructB_field2: array (nullable = true)
     |    |-- element: array (containsNull = true)
     |    |    |-- element: string (containsNull = true)

    flatten(df_nested_B).show()

    +---------------------------+---------------------------+
    |arrayA__childStructB_field1|arrayA__childStructB_field2|
    +---------------------------+---------------------------+
    |                   [[1, 2]]|               [[foo, bar]]|
    +---------------------------+---------------------------+

    flatten(df_nested_C).printSchema()

    root
     |-- arrayA_childStructB_childArrayC_field1: array (nullable = true)
     |    |-- element: array (containsNull = true)
     |    |    |-- element: long (containsNull = true)
     |-- arrayA_childStructB_childArrayC_field2: array (nullable = true)
     |    |-- element: array (containsNull = true)
     |    |    |-- element: string (containsNull = true)

[]: https://spark.apache.org/docs/2.4.0/api/java/org/apache/spark/sql/functions.html#flatten-org.apache.spark.sql.Column

%package help
Summary: Development documents and examples for sparkaid
Provides: python3-sparkaid-doc
%description help
## Flattening
### StructType
Sample DataFrame:

    from pyspark.sql import Row
    from pyspark.sql.functions import col
    df_struct = spark.createDataFrame([Row(structA=Row(field1=10, field2=1.5),
                                           structB=Row(field3="one", field4=False))])
    df_struct.printSchema()

    root
     |-- structA: struct (nullable = true)
     |    |-- field1: long (nullable = true)
     |    |-- field2: double (nullable = true)
     |-- structB: struct (nullable = true)
     |    |-- field3: string (nullable = true)
     |    |-- field4: boolean (nullable = true)
Spark allows selecting nested columns by using the dot `.` notation:

    df_struct.select("structA.*", "structB.field3").printSchema()

    root
     |-- field1: long (nullable = true)
     |-- field2: double (nullable = true)
     |-- field3: string (nullable = true)

Note that the current Spark implementation (2.4.3 or below) doesn't keep the outer-layer field name (e.g. structA) in the output dataframe.
### ArrayType
To select only some elements from an ArrayType column, either *`getItem()`* or brackets (as when selecting elements from a plain array: `[]` in Python, `()` in Scala) would do the trick:

    df_array = spark.createDataFrame([Row(arrayA=[1, 2, 3, 4, 5], fieldB="foo")])
    df_array.select(col("arrayA").getItem(0).alias("element0"),
                    col("arrayA")[4].alias("element5"),
                    col("fieldB")).show()

    +--------+--------+------+
    |element0|element5|fieldB|
    +--------+--------+------+
    |       1|       5|   foo|
    +--------+--------+------+

### MapType
Elements from a MapType column can be selected the same way as with ArrayType, but using the key instead of an index number. The dot notation (`.`) can also be used instead of `getItem()` or brackets:

    df_map = spark.createDataFrame([Row(mapA={2: "TWO", 3: "THREE", 0: "ZERO"}, fieldB="foo")])
    df_map.select(col("mapA")[3].alias("element3"),
                  col("mapA").getItem(2).alias("element2"),
                  col("mapA.0").alias("element0"),
                  col("mapA").getItem(1).alias("element1")).show()

    +--------+--------+--------+--------+
    |element3|element2|element0|element1|
    +--------+--------+--------+--------+
    |   THREE|     TWO|    ZERO|    null|
    +--------+--------+--------+--------+

### StructType nested in StructType
As Spark DataFrame.select() supports passing an array of columns to be selected, to fully flatten a multi-layer nested dataframe, a recursive call would do the trick.
Here is a detailed discussion on Stack Overflow on how to do this:
https://stackoverflow.com/questions/37471346/automatically-and-elegantly-flatten-dataframe-in-spark-sql
### StructType nested in ArrayType

    df_nested = spark.createDataFrame([
        Row(arrayA=[
            Row(childStructB=Row(field1=1, field2="foo")),
            Row(childStructB=Row(field1=2, field2="bar"))
        ])])
    df_nested.printSchema()

    root
     |-- arrayA: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- childStructB: struct (nullable = true)
     |    |    |    |-- field1: long (nullable = true)
     |    |    |    |-- field2: string (nullable = true)

    df_nested.show(1, False)

    +------------------------+
    |arrayA                  |
    +------------------------+
    |[[[1, foo]], [[2, bar]]]|
    +------------------------+

Selecting *field1* or *field2* can be done as with normal structs (not nested inside an array), by using the dot `.` notation. The result is of the type `ArrayType[ChildFieldType]`, which has been **_vertically sliced_** from the original array:
    df_child = df_nested.select("arrayA.childStructB.field1", "arrayA.childStructB.field2")
    df_child.printSchema()

    root
     |-- field1: array (nullable = true)
     |    |-- element: long (containsNull = true)
     |-- field2: array (nullable = true)
     |    |-- element: string (containsNull = true)

    df_child.show()

    +------+----------+
    |field1|    field2|
    +------+----------+
    |[1, 2]|[foo, bar]|
    +------+----------+

### StructType nested in MapType
As each MapType column has two components, the keys and the values, selecting a nested column inside a MapType column is not straightforward - we cannot simply use `.` to reach the nested fields, because the dot is already used for denoting the key:

    df_map_nested = spark.createDataFrame([Row(mapA={"2": Row(type_name="Arabic number", equivalent=2),
                                                     "THREE": Row(type_name="English Text", equivalent=3)},
                                               fieldB="foo")])
    df_map_nested.select(col("mapA.type_name"), col("mapA.THREE.type_name")).show()

    +---------+------------+
    |type_name|   type_name|
    +---------+------------+
    |     null|English Text|
    +---------+------------+

A solution for this is the built-in function `map_values()`, introduced in Spark 2.3. Note the type of the result column: ArrayType.

    from pyspark.sql.functions import map_values
    result = df_map_nested.select(map_values("mapA")["type_name"], col("mapA.THREE.type_name"))
    result.show(2, False)
    result.printSchema()

    +-----------------------------+------------+
    |map_values(mapA).type_name   |type_name   |
    +-----------------------------+------------+
    |[Arabic number, English Text]|English Text|
    +-----------------------------+------------+

    root
     |-- map_values(mapA).type_name: array (nullable = true)
     |    |-- element: string (containsNull = true)
     |-- type_name: string (nullable = true)

## Hurdles
The above steps work well for most dataframes. The only dataframes they fail on (as of Spark 2.4.3 or lower) are those with a StructType nested inside MORE THAN ONE layer of ArrayType.
Like this one:

    df_nested_B = spark.createDataFrame([
        Row(arrayA=[[
            Row(childStructB=Row(field1=1, field2="foo")),
            Row(childStructB=Row(field1=2, field2="bar"))
        ]])])
    df_nested_B.printSchema()

    root
     |-- arrayA: array (nullable = true)
     |    |-- element: array (containsNull = true)
     |    |    |-- element: struct (containsNull = true)
     |    |    |    |-- childStructB: struct (nullable = true)
     |    |    |    |    |-- field1: long (nullable = true)
     |    |    |    |    |-- field2: string (nullable = true)

Or this one:

    df_nested_C = spark.createDataFrame([
        Row(arrayA=[
            Row(childStructB=Row(childArrayC=[Row(field1=1, field2="foo")])),
            Row(childStructB=Row(childArrayC=[Row(field1=2, field2="bar")])),
        ])])
    df_nested_C.printSchema()

    root
     |-- arrayA: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- childStructB: struct (nullable = true)
     |    |    |    |-- childArrayC: array (nullable = true)
     |    |    |    |    |-- element: struct (containsNull = true)
     |    |    |    |    |    |-- field1: long (nullable = true)
     |    |    |    |    |    |-- field2: string (nullable = true)

Selecting `arrayA.childStructB.field1` from `df_nested_B` fails with the error message: `AnalysisException: No such struct field field1 in childStructB`.
Selecting `arrayA.childStructB.childArrayC.field1` from `df_nested_C` throws an `AnalysisException`: `cannot resolve 'arrayA.childStructB.childArrayC['field1']' due to data type mismatch: argument 2 requires integral type, however, ''field1'' is of string type.`
## (More) Solutions
With the introduction of the SQL function `transform` in Spark 2.4, the errors above can be solved by applying `transform` on every layer of the array.
A comprehensive implementation of a flatten function can be found in the Python package `sparkaid`:

    from sparkaid import flatten
    flatten(df_nested_B).printSchema()

    root
     |-- arrayA__childStructB_field1: array (nullable = true)
     |    |-- element: array (containsNull = true)
     |    |    |-- element: long (containsNull = true)
     |-- arrayA__childStructB_field2: array (nullable = true)
     |    |-- element: array (containsNull = true)
     |    |    |-- element: string (containsNull = true)

    flatten(df_nested_B).show()

    +---------------------------+---------------------------+
    |arrayA__childStructB_field1|arrayA__childStructB_field2|
    +---------------------------+---------------------------+
    |                   [[1, 2]]|               [[foo, bar]]|
    +---------------------------+---------------------------+

    flatten(df_nested_C).printSchema()

    root
     |-- arrayA_childStructB_childArrayC_field1: array (nullable = true)
     |    |-- element: array (containsNull = true)
     |    |    |-- element: long (containsNull = true)
     |-- arrayA_childStructB_childArrayC_field2: array (nullable = true)
     |    |-- element: array (containsNull = true)
     |    |    |-- element: string (containsNull = true)

[]: https://spark.apache.org/docs/2.4.0/api/java/org/apache/spark/sql/functions.html#flatten-org.apache.spark.sql.Column

%prep
%autosetup -n sparkaid-1.0.0

%build
%py3_build

%install
%py3_install
# Ship any doc/example directories from the source tree as package docs
install -d -m755 %{buildroot}/%{_pkgdocdir}
if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
pushd %{buildroot}
# Build the %files list from everything installed under the buildroot
if [ -d usr/lib ]; then
    find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst
fi
if [ -d usr/lib64 ]; then
    find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst
fi
if [ -d usr/bin ]; then
    find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst
fi
if [ -d usr/sbin ]; then
    find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst
fi
# Man pages are gzipped by rpmbuild, hence the .gz suffix in the doc list
touch doclist.lst
if [ -d usr/share/man ]; then
    find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst
fi
popd
mv %{buildroot}/filelist.lst .
mv %{buildroot}/doclist.lst .

%files -n python3-sparkaid -f filelist.lst
%dir %{python3_sitelib}/*

%files help -f doclist.lst
%{_docdir}/*

%changelog
* Mon Apr 10 2023 Python_Bot <Python_Bot@openeuler.org> - 1.0.0-1
- Package Spec generated
