Introduction to Apache Spark (Bigdata) in Tamil – ஸ்பார்க் ஒரு அறிமுகம்

குறிப்புகளும், நிரல்களும் இங்கே.


# Download and unpack the Spark 2.3.1 binary distribution (Hadoop 2.7 build).
wget redrockdigimark.com/apachemirror/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
tar -xzvf spark-2.3.1-bin-hadoop2.7.tgz
# Install the tree as /usr/local/bigdata/spark in a single move. Moving it
# first and renaming afterwards from the download directory fails, because
# the extracted directory is no longer there once it has been moved.
sudo mkdir -p /usr/local/bigdata
sudo mv spark-2.3.1-bin-hadoop2.7 /usr/local/bigdata/spark
# Make the Spark launchers (pyspark, spark-submit, ...) available on PATH.
export SPARK_HOME=/usr/local/bigdata/spark
export PATH=$PATH:$SPARK_HOME/bin
# Commands for the interactive pyspark shell, where `sc` is already defined.
# Load the sample file as an RDD of text lines.
rdd1 = sc.textFile("file:///home/shrini/women.txt")
# Actions: fetch everything, count lines, count unique lines, histogram of lines.
rdd1.collect()
rdd1.count()
rdd1.distinct().count()
rdd1.countByValue()
# Peek at the data: first line, first three lines, a fixed-size random sample.
rdd1.first()
rdd1.take(3)
rdd1.takeSample(withReplacement=False, num=4, seed=1)
# Sample by fraction (with replacement) rather than by a fixed size.
rdd1.sample(withReplacement=True, fraction=0.5, seed=3).collect()
# map keeps one list of words per line; flatMap flattens them into single words.
rdd1.map(lambda line: line.split(" ")).take(3)
rdd1.flatMap(lambda line: line.split(" ")).take(3)
# Count the lines that mention Chennai.
rdd1.filter(lambda line: "Chennai" in line).count()
rdd1.getNumPartitions()
# Activate the log4j configuration template shipped in Spark's conf directory.
# Absolute paths are used so the command works from any working directory.
sudo mv /usr/local/bigdata/spark/conf/log4j.properties.template /usr/local/bigdata/spark/conf/log4j.properties
# Run the two example scripts as Spark applications.
/usr/local/bigdata/spark/bin/spark-submit ~/counting.py
/usr/local/bigdata/spark/bin/spark-submit ~/populations.py

view raw

spark-notes.txt

hosted with ❤ by GitHub

 


001,Nithya,Duraisamy,31,Manager,9586452156,Hyderabad
002,Nandhini,Babu,28,Assistant Manager,9848022338,Delhi
003,Madhuri,Nathan,51,VP,9848022339,Delhi
004,Kavitha,Manoharan,45,AVP,9848022330,Hyderabad
005,Vijaya,Kandasamy,45,AVP,9848022336,Noida
006,Aarthi,Raj,28,Assistant Manager,9848022335,Chennai
007,Lavanya,Sankar,23,Senior Engineer,9848022334,Chennai
008,Meena,Baskar,51,VP,9848022333,Hyderabad
009,Gayathri,Ragu,22,Engineer,9848022333,Chennai
010,Thenmozhi,Rajan,45,AVP,9848022336,Noida
010,Thenmozhi,Rajan,45,AVP,9848022336,Noida

view raw

women.txt

hosted with ❤ by GitHub

 


from pyspark import SparkConf, SparkContext
# Configure an application named "WordCount" that runs on the "local" master.
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("WordCount")
sc = SparkContext(conf=conf)

# Load the input file as an RDD of text lines.
rdd1 = sc.textFile("file:///home/shrini/smp.csv")
def cols(data):
    """Split one comma-separated record line into its seven fields.

    Args:
        data: A text line holding exactly seven comma-separated values.

    Returns:
        A 7-tuple of strings:
        (sno, fname, lname, age, desig, mob, location).

    Raises:
        ValueError: If the line does not split into exactly seven fields.
    """
    # Unpacking into seven names doubles as a field-count check.
    sno, fname, lname, age, desig, mob, location = data.split(",")
    return sno, fname, lname, age, desig, mob, location
# Counts are computed with count() on the RDD instead of collecting a
# per-value histogram with countByValue() and summing it on the driver;
# the resulting numbers are the same.

# A record counts as a manager when the text "Manager" occurs anywhere in
# the raw line, so "Assistant Manager" rows are included as well.
managers = rdd1.filter(lambda line: "Manager" in line).count()

# Index 3 of the tuple returned by cols() is the age, still a string here.
seniors = rdd1.map(cols).filter(lambda rec: int(rec[3]) >= 30).count()

print("Total No. of records:", rdd1.count())
print("Distinct records:", rdd1.distinct().count())
print("Total No. of Managers:", managers)
# The label states the same threshold the filter applies (30 and above).
print("No. of Seniors (age>=30):", seniors)

view raw

counting.py

hosted with ❤ by GitHub


from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
from matplotlib import pyplot
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import Row
from pyspark.ml.regression import LinearRegression
# Start a Spark context on the 'local' master and wrap it in a SparkSession
# so the DataFrame API is available.
sc = SparkContext('local')
spark = SparkSession(sc)

# Read the CSV with a header row and let Spark infer the column types.
reader = spark.read.option('header', 'true').option('inferSchema', 'true')
df1 = reader.csv('file:///home/shrini/born_babies.csv')

# Show the DataFrame summary, its column names, and the full contents.
print(df1)
print(df1.columns)
print(df1.toPandas())
# Total births per year: sum the male and female counts within each year,
# add the two sums, and order the result by year.
df2 = (
    df1.groupBy('yr')
    .agg({'male': 'sum', 'female': 'sum'})
    .select(col('yr'), (col('sum(male)') + col('sum(female)')).alias('populations'))
    .orderBy('yr')
)

# Convert to pandas once and reuse the result; each toPandas() call runs the
# Spark job again and collects the rows to the driver.
yearly = df2.toPandas()
print(yearly)

pyplot.plot(yearly.yr, yearly.populations)
pyplot.xlabel('Year')
pyplot.ylabel('No. of babies')
pyplot.title('Population includes new born male and female babies')
# NOTE(review): the arrow target (2001, 0) and text position (2002, 1.1) are
# hard-coded data coordinates and are not derived from the plotted values --
# confirm they point at the intended peak for this dataset.
pyplot.annotate('local max', xy=(2001, .0), xytext=(2002, 1.1), fontsize=12,
                arrowprops=dict(facecolor='grey', shrink=0.05, linewidth=2))
pyplot.show()
# One assembler turns the 'yr' column into the 'features' vector column for
# both the training data and the years to be predicted.
assembler = VectorAssembler(inputCols=['yr'], outputCol='features')

# Training set: yearly totals plus the 'features' vector and a 'label' column
# (LinearRegression reads 'features' and 'label' by default).
train = (
    assembler.transform(df2)
    .withColumn('year', df2.yr)
    .withColumn('label', df2.populations)
)
# Collect the training data once; it is reused for the year list and the plot.
train_pdf = train.toPandas()
print(train_pdf)

# Years to score: every year present in the training data plus 2019-2023.
years = train_pdf.yr.tolist() + [2019, 2020, 2021, 2022, 2023]
i = assembler.transform(sc.parallelize(years).map(Row('yr')).toDF())

# Despite its name, `model` holds the predictions (as a pandas DataFrame)
# produced by the fitted linear-regression model.
model = LinearRegression(maxIter=10).fit(train).transform(i).toPandas()
print(model)

# Both lines carry a label so that the legend below has entries to show.
pyplot.plot(model.yr, model.prediction, label='Predicted')
pyplot.plot(train_pdf.yr, train_pdf.populations, label='Actual')
pyplot.legend(loc=4)
pyplot.title('Prediction on future population')
pyplot.show()

view raw

populations.py

hosted with ❤ by GitHub

%d bloggers like this: