-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathsequential.py
32 lines (23 loc) · 876 Bytes
/
sequential.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# Generate a sequential number for each row within each group, ordered by date.
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
# Initialize (or reuse) the Spark session for this example.
spark = SparkSession.builder.appName("RowNumberPerGroup").getOrCreate()

# Sample data: two groups, each with two dated rows. Dates are ISO-8601
# strings here and are converted to a real date type below.
group_data = [
    Row(GroupID='A', Date='2023-01-01'),
    Row(GroupID='A', Date='2023-01-02'),
    Row(GroupID='B', Date='2023-01-01'),
    Row(GroupID='B', Date='2023-01-03'),
]

# Create DataFrame from the sample rows.
df_group = spark.createDataFrame(group_data)

# Convert Date from string to date type so ordering is chronological.
# The column is referenced through the DataFrame itself because `col` is not
# imported in this file; calling `col("Date")` here raised a NameError.
df_group = df_group.withColumn("Date", df_group["Date"].cast("date"))

# Define window spec: one partition per GroupID, rows ordered by Date.
windowSpec = Window.partitionBy("GroupID").orderBy("Date")

# Generate the sequential number (1, 2, 3, ...) within each group.
# NOTE(review): rows that share the same Date inside a group have no
# tie-breaker here, so their relative numbering is not guaranteed to be stable.
df_group = df_group.withColumn("SeqNum", row_number().over(windowSpec))

# Show results.
df_group.show()