from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

if __name__ == '__main__':
    # Script outline:
    #   1. read every line of data2_2.txt,
    #   2. build a sliding window of three consecutive lines,
    #   3. split each line of a window into its ', '-separated tokens,
    #   4. keep the tokens present in all three lines of the window,
    #   5. flatten the per-window results into one collection.
    # The content of the RDD is printed after every stage.
    sc = SparkContext('local')
    try:
        # Kept from the original script; the session object itself is not
        # used by any of the stages below.
        spark = SparkSession(sc)

        with open('data2_2.txt', 'r') as f:
            lines = f.readlines()

        # Overlapping windows (i, i+1, i+2). zip stops at the shortest
        # slice, so fewer than three lines yields an empty list.
        lines_3_by_3 = list(zip(lines, lines[1:], lines[2:]))

        windows = sc.parallelize(lines_3_by_3)
        print(windows.collect())

        # Each raw line becomes a list of tokens; surrounding whitespace
        # (including the trailing newline) is stripped before splitting.
        tokenized = windows.map(
            lambda t: tuple(s.strip().split(', ') for s in t)
        )
        print(tokenized.collect())

        # Tokens shared by all three lines of a window.
        common = tokenized.map(lambda t: set(t[0]) & set(t[1]) & set(t[2]))
        print(common.collect())

        # One flat collection of shared tokens; a token appears once per
        # window in which it is common to the three lines.
        flattened = common.flatMap(lambda s: list(s))
        print(flattened.collect())
    finally:
        # Always shut the context down, even if reading the file or a
        # Spark job raised, so the local Spark backend is released.
        sc.stop()