Dedup logic in Spark SQL or Hive:
select
*
from (select
*
,(row_number() over (partition by user_id order by mts_trckng_rowkey)) as alias_1
from DB_NAME.TABLE_NAME
where dt = '20191025'
) alias_2
WHERE alias_2.alias_1 = 1;
select
*
from (select
*
,(row_number() over (partition by user_id order by mts_trckng_rowkey)) as alias_1
from DB_NAME.TABLE_NAME
where dt = '20191025'
) alias_2
WHERE alias_2.alias_1 = 1;
Dedup logic in Scala:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
// Dedup logic : Remove duplicate records by eventid and DataSourceKey combination
val Data_DF_Final_Dedup = Data_DF_Final.withColumn("ROWNUM", row_number().over(Window.partitionBy(col("Id"), col("DataSourceKey")).orderBy($"Updateddate".desc))).filter("ROWNUM = 1").drop("ROWNUM")
display(Data_DF_Final_Dedup)
ADB Spark SQL
%sql
select * from
(
select eventid,
row_number() OVER (PARTITION BY Id ORDER BY Updateddate DESC) alias1
from parquet.`abfss://xy@abc.dfs.core.windows.net/dataproducts/test/v1/sen/full`
where id = 115894
) alias2
where alias2.alias1=1
No comments:
Post a Comment