Notes and programs are here.
wget redrockdigimark.com/apachemirror/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
tar -xzvf spark-2.3.1-bin-hadoop2.7.tgz
sudo mkdir -p /usr/local/bigdata
sudo mv spark-2.3.1-bin-hadoop2.7 /usr/local/bigdata/spark
export SPARK_HOME=/usr/local/bigdata/spark
export PATH=$PATH:/usr/local/bigdata/spark/bin
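A quick sanity check for the install (a minimal sketch, assuming the exports above are in effect; sanity_check.py is a file name used only here):

# sanity_check.py -- run with: spark-submit sanity_check.py
from pyspark import SparkContext

sc = SparkContext('local', 'SanityCheck')
print("Spark version:", sc.version)             # expected: 2.3.1
print("Default parallelism:", sc.defaultParallelism)
sc.stop()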
rdd1 = sc.textFile("file:///home/shrini/women.txt")
rdd1.collect()
rdd1.count()
rdd1.distinct().count()
rdd1.countByValue()                              # dict of element -> occurrence count
rdd1.first()
rdd1.take(3)
rdd1.takeSample(False, 4, 1)                     # 4 elements without replacement, seed 1
rdd1.sample(True, 0.5, 3).collect()              # sample with replacement, fraction 0.5, seed 3
rdd1.map(lambda i: i.split(" ")).take(3)         # one list of words per line
rdd1.flatMap(lambda i: i.split(" ")).take(3)     # flattened stream of words
rdd1.filter(lambda i: ("Chennai" in i)).count()
rdd1.getNumPartitions()
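The map/flatMap lines above are the building blocks of the classic word count; a minimal sketch (assuming the same women.txt and a pyspark shell where sc already exists):

# word count over the same file, built from flatMap/map/reduceByKey
words = sc.textFile("file:///home/shrini/women.txt")\
          .flatMap(lambda line: line.split(" "))         # one element per word
counts = words.map(lambda w: (w, 1))\
              .reduceByKey(lambda a, b: a + b)           # sum the 1s per word
print(counts.takeOrdered(10, key=lambda kv: -kv[1]))     # ten most frequent words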
sudo mv $SPARK_HOME/conf/log4j.properties.template $SPARK_HOME/conf/log4j.properties   # activate conf/log4j.properties (edit it to reduce log verbosity)
/usr/local/bigdata/spark/bin/spark-submit ~/counting.py
/usr/local/bigdata/spark/bin/spark-submit ~/populations.py
smp.csv:
001,Nithya,Duraisamy,31,Manager,9586452156,Hyderabad
002,Nandhini,Babu,28,Assistant Manager,9848022338,Delhi
003,Madhuri,Nathan,51,VP,9848022339,Delhi
004,Kavitha,Manoharan,45,AVP,9848022330,Hyderabad
005,Vijaya,Kandasamy,45,AVP,9848022336,Noida
006,Aarthi,Raj,28,Assistant Manager,9848022335,Chennai
007,Lavanya,Sankar,23,Senior Engineer,9848022334,Chennai
008,Meena,Baskar,51,VP,9848022333,Hyderabad
009,Gayathri,Ragu,22,Engineer,9848022333,Chennai
010,Thenmozhi,Rajan,45,AVP,9848022336,Noida
010,Thenmozhi,Rajan,45,AVP,9848022336,Noida
counting.py:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("WordCount")
sc = SparkContext(conf=conf)

rdd1 = sc.textFile("file:///home/shrini/smp.csv")

def cols(data):
    # split a CSV line into its seven fields
    sno, fname, lname, age, desig, mob, location = data.split(",")
    return sno, fname, lname, age, desig, mob, location

dict1 = rdd1.countByValue()
dict2 = rdd1.map(cols).filter(lambda line: int(line[3]) >= 30).countByValue()

# counts any record whose line contains "Manager" (includes Assistant Manager)
managers = 0
for i, j in dict1.items():
    if "Manager" in i:
        managers = managers + j

seniors = 0
for j in dict2.values():
    seniors = seniors + j

print("Total No. of records:", str(rdd1.count()))
print("Distinct records:", str(rdd1.distinct().count()))
print("Total No. of Managers:", str(managers))
print("No. of Seniors (age >= 30):", str(seniors))
populations.py:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
from matplotlib import pyplot
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import Row
from pyspark.ml.regression import LinearRegression

sc = SparkContext('local')
spark = SparkSession(sc)

# read the yearly birth counts (columns include yr, male, female)
df1 = spark.read.option('header', 'true')\
    .option('inferSchema', 'true')\
    .csv('file:///home/shrini/born_babies.csv')
print(df1)
print(df1.columns)
print(df1.toPandas())

# total births per year = sum of male and female counts
df2 = df1.groupBy('yr')\
    .agg({'male': 'sum', 'female': 'sum'})\
    .select(col('yr'), (col('sum(male)') + col('sum(female)')).alias('populations'))\
    .orderBy('yr')
print(df2.toPandas())

pyplot.plot(df2.toPandas().yr, df2.toPandas().populations)
pyplot.xlabel('Year')
pyplot.ylabel('No. of babies')
pyplot.title('Population includes new born male and female babies')
pyplot.annotate('local max', xy=(2001, .0), xytext=(2002, 1.1), fontsize=12,
                arrowprops=dict(facecolor='grey', shrink=0.05, linewidth=2))
pyplot.show()

# training data: 'features' = assembled year, 'label' = total births
train = VectorAssembler(inputCols=['yr'], outputCol='features').transform(df2)\
    .withColumn('year', df2.yr)\
    .withColumn('label', df2.populations)
print(train.toPandas())

# prediction input: the observed years plus five future years
years = train.select('yr').rdd.map(lambda x: x[0]).collect() + [2019, 2020, 2021, 2022, 2023]
future_df = sc.parallelize(years).map(Row('yr')).toDF()
i = VectorAssembler(inputCols=['yr'], outputCol='features').transform(future_df)

# fit a linear model on the observed years and predict for all years
model = LinearRegression(maxIter=10).fit(train).transform(i).toPandas()
print(model)

pyplot.plot(model.yr, model.prediction, label='predicted')
pyplot.plot(train.select('yr').rdd.map(lambda x: x[0]).collect(),
            train.select('populations').rdd.map(lambda x: x[0]).collect(),
            label='actual')
pyplot.legend(loc=4)
pyplot.title('Prediction on future population')
pyplot.show()
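To also inspect the fitted line itself, one small extension (a sketch; lr_model is a name introduced here, and it assumes the script above has already built train):

# keep the fitted model object instead of chaining fit().transform()
lr_model = LinearRegression(maxIter=10).fit(train)
print("slope (babies per year):", lr_model.coefficients[0])
print("intercept:", lr_model.intercept)
print("training RMSE:", lr_model.summary.rootMeanSquaredError)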