Practical 1: Write a program in Map Reduce for WordCount operation.
WordCount.java (create a file named WordCount.java)
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
  public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
    private Text word = new Text();
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      // Tokenize the line and emit (word, 1) for every token
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, new IntWritable(1));
      }
    }
  }
  public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      // Sum all the 1s emitted for this word
      int sum = 0;
      for (IntWritable x : values) {
        sum += x.get();
      }
      context.write(key, new IntWritable(sum));
    }
  }
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class); // mentioning the main class
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class); // combiner pre-aggregates on each mapper
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
input.txt (input data)
how are you where are you
Steps to run the program:
start hadoop:
start-dfs.sh
start-yarn.sh
hadoop com.sun.tools.javac.Main WordCount.java
ls -l
hdfs dfs -ls /
hdfs dfs -rm -r /wordcount
jar cf wc.jar WordCount*.class
Check on localhost (the NameNode web UI) whether the file is there.
hdfs dfs -mkdir -p /wordcount/input
hdfs dfs -copyFromLocal input.txt /wordcount/input
hadoop jar wc.jar WordCount /wordcount/input /wordcount/output
hdfs dfs -cat /wordcount/output/part-r-00000
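For the sample input above, part-r-00000 should contain one tab-separated count per word:
are	2
how	1
where	1
you	2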
Practical 2: Write a program in Map Reduce for Matrix Multiplication.
MatrixMultiply.java
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class MatrixMultiply {
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Usage: MatrixMultiply <in_dir> <out_dir>");
System.exit(2);
}
Configuration conf = new Configuration();
[Link]("n", "100");
[Link]("p", "1000");
@SuppressWarnings("deprecation")
Job job = new Job(conf, "MatrixMultiply");
job.setJarByClass(MatrixMultiply.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
Map.java
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class Map
extends Mapper<LongWritable, Text, Text, Text> {
@Override
public void map(LongWritable key, Text value, Context context) throws IOException,
InterruptedException {
Configuration conf = context.getConfiguration();
int m = Integer.parseInt(conf.get("m"));
int p = Integer.parseInt(conf.get("p"));
String line = value.toString();
// Input lines look like: M,i,j,value or N,j,k,value
String[] indicesAndValue = line.split(",");
Text outputKey = new Text();
Text outputValue = new Text();
if (indicesAndValue[0].equals("M")) {
for (int k = 0; k < p; k++) {
// M(i,j) contributes to every cell (i,k) of the product
outputKey.set(indicesAndValue[1] + "," + k);
outputValue.set(indicesAndValue[0] + "," + indicesAndValue[2] + "," + indicesAndValue[3]);
context.write(outputKey, outputValue);
}
} else {
for (int i = 0; i < m; i++) {
[Link](i + "," + indicesAndValue[2]);
[Link]("N," + indicesAndValue[1] + "," + indicesAndValue[3]);
[Link](outputKey, outputValue);
}
}
}
}
Reduce.java
import java.io.IOException;
import java.util.HashMap;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class Reduce
extends Reducer<Text, Text, Text, Text> {
@Override
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException,
InterruptedException {
String[] value;
HashMap<Integer, Float> hashA = new HashMap<Integer, Float>();
HashMap<Integer, Float> hashB = new HashMap<Integer, Float>();
for (Text val : values) {
value = val.toString().split(",");
if (value[0].equals("M")) {
hashA.put(Integer.parseInt(value[1]), Float.parseFloat(value[2]));
} else {
hashB.put(Integer.parseInt(value[1]), Float.parseFloat(value[2]));
}
}
int n = Integer.parseInt(context.getConfiguration().get("n"));
float result = 0.0f;
float m_ij;
float n_jk;
for (int j = 0; j < n; j++) {
// key is "i,k"; compute the dot product sum over j of M(i,j) * N(j,k)
m_ij = hashA.containsKey(j) ? hashA.get(j) : 0.0f;
n_jk = hashB.containsKey(j) ? hashB.get(j) : 0.0f;
result += m_ij * n_jk;
}
if (result != 0.0f) {
context.write(null,
new Text(key.toString() + "," + Float.toString(result)));
}
}
}
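How the job works: for each product cell (i,k), the mapper replicates M(i,j) and N(j,k) under the key "i,k", and the reducer for that key computes the dot product, the sum over j of M(i,j) * N(j,k).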
Create a file named M.txt and put the following into it:
M,0,0,12
M,0,1,13
M,1,0,14
M,1,1,15
Create a file named N.txt and put the following into it:
N,0,0,11
N,0,1,13
N,1,0,14
N,1,1,19
Steps to run the program:
start-dfs.sh
start-yarn.sh
hadoop com.sun.tools.javac.Main MatrixMultiply.java Map.java Reduce.java
jar cf MatrixMultiply.jar *.class
ls -l
hdfs dfs -mkdir /MatrixMultiply
hdfs dfs -mkdir /MatrixMultiply/input
hdfs dfs -ls /
hdfs dfs -copyFromLocal M.txt N.txt /MatrixMultiply/input
hadoop jar MatrixMultiply.jar MatrixMultiply /MatrixMultiply/input /MatrixMultiply/output
hdfs dfs -cat /MatrixMultiply/output/part-r-00000
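With the 2x2 sample matrices above and the dimensions set to 2 in the driver, the product M x N is [[314, 403], [364, 467]], so the output should read:
0,0,314.0
0,1,403.0
1,0,364.0
1,1,467.0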
MONGODB
Command prompt 1: mongod
Command prompt 2: mongosh
Practical 2: Sample Database Creation
Start cmd -> mongod
Start a new cmd -> mongosh
show dbs
use tanvi (any database name works)
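Note: the new database will not appear in show dbs until you create a collection or insert a document into it.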
Practical 3: Query the Sample Database using MongoDB querying commands
[Link]("student")
[Link]({name: "Tanvi Tawade", rollno:61, div:"A"})
db.student.insertMany([{name: "Namrata Gaikwad", rollno:12, div: "B"},
{name: "Omkar Daifale", rollno:10, div:"A"},
{name: "Chinmay Warang", rollno:69, div:"A"},
{name: "Shreya Nikam", rollno:33, div:"B"},
{name: "Pratiksha Majrekar", rollno:31, div:"A"},
{name: "Heth Shah", rollno:52, div:"B"},
{name: "Ketan Bhoir", rollno:6, div:"B"},
{name: "Uday Gavada", rollno:16, div:"A"},
{name: "Prathmesh Patil", rollno:38, div:"B"},
{name: "Swaraj Wadkar", rollno:67, div:"A"}]) (this is all a single command)
db.student.find({})
db.student.find().pretty()
db.student.find({name:"Tanvi Tawade"})
db.student.find({name: {$in:["Tanvi Tawade", "Swaraj Wadkar"]}})
db.student.find({$and:[{name:"Tanvi Tawade"},{rollno:61}]})
db.student.find({$or:[{name:"Tanvi Tawade"},{rollno:31}]})
db.student.find({rollno:{$lt:62}, $or:[{name:"Tanvi Tawade"},{div:"A"}] })
db.student.find({rollno:{$lt:62}, $or:[{name:"Tanvi Tawade"},{div:"B"}] })
db.student.find({$or:[{name:/^C/},{name:/^T/}]})
db.student.find({$nor:[{name:"Swaraj Wadkar"},{div:"B"}]})
[Link]({name:"Heth Shah"})
[Link]({name:"Heth Shah"},{$set: {div:"A"}})
[Link]([{name: "Namrata Gaikwad", rollno:12, div: "B"},
{name: "Omkar Daifale", rollno:10, div:"A"},
{name: "Shreya Nikam", rollno:33, div:"B"}])
[Link]({div:"B"},{$set:{div:"A"}})
[Link]({name:"Namrata Gaikwad"},{$set:{div:"B",rollno:13} })
[Link]({rollno:38})
[Link]({$or: [{rollno:{$lt:30}},{div:"B"}]})
[Link]({name:1, rollno:1},{name: "idx_name_rollno"})
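To confirm that queries on name can use the new index, inspect the query plan:
db.student.find({name:"Swaraj Wadkar"}).explain("executionStats")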
HIVE
Practical 3: Create Database & Table in Hive
To start Hive, go to /home/hadoop/apache-hive-3.1.2-bin (Hadoop must already be running):
start-dfs.sh
start-yarn.sh
hive
create database tanvi;
show databases;
use tanvi;
create table student(rno int, name string,section string, marks int);
show tables;
insert into table student values(61,'Tanvi', 'A', 83);
select * from student;
insert into table student values(12, 'Namrata', 'B', 54), (10,'Omkar','A',53),
(31,'Pratiksha','A',89),(33,'Shreya','B',23),(6,'Ketan','B',47),(69,'Chinmay','B',59),
(16,'Uday','A',78),(52,'Heth','B',68),(38,'Prathmesh','B',48), (67,'Swaraj','A',56);
Practical 4: Hive Partitioning
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition; (prints the current value)
set hive.exec.dynamic.partition.mode=nonstrict;
Create a file student.txt:
61,Tanvi,A,83
12,Namrata,B,54
10,Omkar,A,53
31,Pratiksha,A,89
33,Shreya,B,23
6,Ketan,B,47
69,Chinmay,B,59
16,Uday,A,78
52,Heth,B,68
38,Prathmesh,B,48
67,Swaraj,A,56
create table student_part(rno int, name string,marks int)
partitioned by(section string)
row format delimited fields terminated by ',' ;
-- LOAD DATA cannot assign rows to partitions by value; stage the file in a
-- plain table first, then insert with dynamic partitioning:
create table student_stage(rno int, name string, section string, marks int)
row format delimited fields terminated by ',';
LOAD DATA LOCAL INPATH '/home/hadoop/hive/student.txt' INTO TABLE student_stage;
INSERT INTO TABLE student_part PARTITION(section)
SELECT rno, name, marks, section FROM student_stage;
DESCRIBE FORMATTED student_part;
SELECT COUNT(*) FROM student_part WHERE section = 'A';
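To verify that one partition directory per section was created:
SHOW PARTITIONS student_part;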
Practical 7: Hive Views and Indexes
CREATE VIEW emp_view AS SELECT * FROM employee WHERE salary>60000;
select * from emp_view;
drop view emp_view;
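Note: CREATE INDEX was removed in Hive 3.0, so on apache-hive-3.1.2 this practical covers views only.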
Practical 8: HiveQL : Select Where, Select OrderBy, Select GroupBy, Select Joins
Create a text file emp.txt:
61,Tanvi,Manager,83000
12,Namrata,Developer,54000
10,Omkar,Tester,53000
31,Pratiksha,Manager,89000
33,Shreya,Developer,23000
6,Ketan,Tester,47000
69,Chinmay,Developer,59000
16,Uday,Tester,78000
52,Heth,Developer,68000
38,Prathmesh,Developer,48000
67,Swaraj,Tester,56000
CREATE TABLE employee (empcode INT, ename STRING, job STRING, salary INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
LOAD DATA LOCAL INPATH '/home/hadoop/hive/emp.txt' INTO TABLE employee;
select * from employee;
select count(*) from employee;
select avg(salary) from employee;
ALTER TABLE employee RENAME TO emp;
Create a file emp1.txt:
61,Tanvi,1,83000
12,Namrata,3,54000
10,Omkar,2,53000
31,Pratiksha,1,89000
33,Shreya,2,23000
6,Ketan,2,47000
69,Chinmay,3,59000
16,Uday,2,78000
52,Heth,3,68000
38,Prathmesh,3,48000
67,Swaraj,2,56000
37,Rupali,2,66000
CREATE TABLE employee (empcode INT, ename STRING, dno INT, salary INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
LOAD DATA LOCAL INPATH '/home/hadoop/hive/emp1.txt' INTO TABLE employee;
select * from employee;
Create a file dept.txt:
1,Manager,Mumbai
2,Tester,Pune
3,Developer,Delhi
CREATE TABLE department (dno INT, dname STRING, location STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
LOAD DATA LOCAL INPATH '/home/hadoop/hive/dept.txt' INTO TABLE department;
select * from department;
select * from employee e, department d where e.dno = d.dno;
select count(*) from employee group by dno;
select count(*) from employee e, department d where e.dno = d.dno and d.dname = 'Manager';
ALTER TABLE employee ADD COLUMNS (dept STRING COMMENT 'Department
name');
SELECT dno, COUNT(*) FROM employee GROUP BY dno;
SELECT e.empcode, e.ename, e.salary, d.dno, d.dname FROM employee e JOIN
department d ON (e.dno = d.dno);
SELECT e.empcode, e.ename, e.salary, d.dno, d.dname FROM employee e LEFT OUTER
JOIN department d ON (e.dno = d.dno);
SELECT e.empcode, e.ename, e.salary, d.dno, d.dname FROM employee e RIGHT OUTER
JOIN department d ON (e.dno = d.dno);
SELECT e.empcode, e.ename, d.dno, d.dname FROM employee e FULL OUTER JOIN
department d ON (e.dno = d.dno);
PIG
Practical 2: Pig Latin Basic
1. Display total number of students
Create a file student.txt:
61, Tanvi Tawade, maths, 85
61, Tanvi Tawade, aiml, 90
61, Tanvi Tawade, dscc, 78
12, Namrata Gaikwad, maths, 75
12, Namrata Gaikwad, aiml, 82
12, Namrata Gaikwad, dscc, 90
10, Omkar Daifale, maths, 92
10, Omkar Daifale, aiml, 88
10, Omkar Daifale, dscc, 76
69, Chinmay Warang, maths, 80
69, Chinmay Warang, aiml, 85
69, Chinmay Warang, dscc, 92
33, Shreya Nikam, maths, 88
33, Shreya Nikam, aiml, 78
33, Shreya Nikam, dscc, 85
31, Pratiksha Majrekar, maths, 76
31, Pratiksha Majrekar, aiml, 90
31, Pratiksha Majrekar, dscc, 82
52, Heth Shah, maths, 90
52, Heth Shah, aiml, 85
52, Heth Shah, dscc, 88
6, Ketan Bhoir, maths, 82
6, Ketan Bhoir, aiml, 76
6, Ketan Bhoir, dscc, 90
16, Uday Gavada, maths, 85
16, Uday Gavada, aiml, 92
16, Uday Gavada, dscc, 78
38, Prathmesh Patil, maths, 78
38, Prathmesh Patil, aiml, 85
38, Prathmesh Patil, dscc, 90
67, Swaraj Wadkar, maths, 92
67, Swaraj Wadkar, aiml, 80
67, Swaraj Wadkar, dscc, 86
stud = LOAD 'student.txt' using PigStorage(',') AS (rno:int, name:chararray, sub:chararray, mark:int);
dump stud;
describe stud;
stud: {rno: int,name: chararray,sub: chararray,mark: int}
A = group stud all;
dump A;
B = foreach A generate COUNT(stud);
dump B;
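Expected output: (33), the total number of records in the file (11 students x 3 subjects each).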
2. Display subject wise student count
A = group stud by sub;
dump A;
B = foreach A generate COUNT(stud);
dump B;
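Expected output: one tuple (11) per subject group, since each of the three subjects has 11 rows.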
B = foreach A generate AVG(stud.mark);
dump B;
B = foreach A generate group, AVG(stud.mark);
dump B;
B = foreach A generate group, SUM(stud.mark);
dump B;
B = foreach A generate group, MAX(stud.mark);
dump B;
B = foreach A generate MAX(stud.mark);
dump B;
B = foreach A generate group, MIN(stud.mark);
dump B;
Practical 4: Download the data
pig -x local
Create a file student1.txt:
61, Tanvi, Tawade, 22, 9766543210, Mumbai
12, Namrata, Gaikwad, 23, 9876543210, Mumbai
10, Omkar, Daifale, 22, 8765432109, Bangalore
69, Chinmay, Warang, 24, 7654321098, Delhi
33, Shreya, Nikam, 21, 6543210987, Mumbai
31, Pratiksha, Majrekar, 25, 5432109876, Hyderabad
52, Heth, Shah, 23, 4321098765, Bangalore
6, Ketan, Bhoir, 24, 3210987654, Mumbai
16, Uday, Gavada, 22, 2109876543, Delhi
38, Prathmesh, Patil, 21, 1098765432, Hyderabad
67, Swaraj, Wadkar, 25, 9876543210, Chennai
student1 = LOAD 'student1.txt' using PigStorage(',') AS (rno:chararray, fname:chararray,
lname:chararray, age:int, phone:chararray, city:chararray); -- phone as chararray: ten-digit values overflow int
dump student1;
STORE student1 into 'student_output.txt' using PigStorage('|');
Practical 5: Create your Script
1. Write the following Pig Latin commands in a file called emp_data.pig.
emp = load 'emp.txt' using PigStorage(',') AS (eid:chararray, name:chararray,
designation:chararray, deptid:chararray, salary:int);
STORE emp into 'emp_output.txt' using PigStorage(',');
ss = FOREACH emp GENERATE eid, name, deptid;
dump ss;
Practical 6: Save and Execute the Script
2. Execute the Apache Pig script using the following command.
pig -x local emp_data.pig
exec emp_data.pig
run emp_data.pig
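Note: exec runs the script in a separate Grunt context, whereas run executes it as if typed at the grunt> prompt, so aliases defined in the script stay available afterwards.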
Practical 7: Pig Operations : Diagnostic Operators, Grouping and Joining, Combining
& Splitting, Filtering, Sorting
Create a file student1.txt:
61, Tanvi, Tawade, 22, 9766543210, Mumbai
12, Namrata, Gaikwad, 23, 9876543210, Mumbai
10, Omkar, Daifale, 22, 8765432109, Bangalore
69, Chinmay, Warang, 24, 7654321098, Delhi
33, Shreya, Nikam, 21, 6543210987, Mumbai
31, Pratiksha, Majrekar, 25, 5432109876, Hyderabad
52, Heth, Shah, 23, 4321098765, Bangalore
6, Ketan, Bhoir, 24, 3210987654, Mumbai
16, Uday, Gavada, 22, 2109876543, Delhi
38, Prathmesh, Patil, 21, 1098765432, Hyderabad
67, Swaraj, Wadkar, 25, 9876543210, Chennai
student1 = LOAD 'student1.txt' using PigStorage(',') AS (rno:chararray, fname:chararray,
lname:chararray, age:int, phone:chararray, city:chararray);
a. Diagnostic Operators
dump student1;
describe student1;
explain student1;
stud_11 = FILTER student1 BY age < 23;
dump stud_11;
C = FOREACH student1 GENERATE rno, fname, city;
dump C;
illustrate C;
b. Grouping and Joining
stud_1 = GROUP student1 BY city;
dump stud_1;
describe stud_1;
stud_2 = GROUP student1 BY (city,age);
dump stud_2;
describe stud_2;
1. self join
A = LOAD 'student1.txt' using PigStorage(',') AS (rno:chararray, fname:chararray,
lname:chararray, age:int, phone:chararray, city:chararray);
B = LOAD 'student1.txt' using PigStorage(',') AS (rno:chararray, fname:chararray,
lname:chararray, age:int, phone:chararray, city:chararray);
C = JOIN A BY age, B BY age;
dump C;
2. Inner Join (equijoin)- An inner join returns rows when there is a match in both
tables.
Create a file emp.txt (no spaces after the commas, so the chararray deptid values match dept.txt exactly):
61,Tanvi,Manager,1,83000
12,Namrata,Quality Assurance,3,54000
10,Omkar,Engineering,2,53000
31,Pratiksha,Manager,1,89000
33,Shreya,Testing,2,23000
6,Ketan,Testing,2,47000
69,Chinmay,Quality Assurance,3,59000
16,Uday,Testing,2,78000
52,Heth,Quality Assurance,3,68000
38,Prathmesh,Quality Assurance,3,48000
67,Swaraj,Testing,2,56000
37,Rupali,Testing,2,66000
emp = LOAD 'emp.txt' using PigStorage(',') AS (eid:chararray, name:chararray,
designation:chararray, deptid:chararray, salary:int);
dump emp;
Create a file dept.txt:
1,Finance
2,Testing
3,Quality Assurance
dept = LOAD 'dept.txt' using PigStorage(',') AS (deptid:chararray, dname:chararray);
dump dept;
emp_dept_innerjoin = JOIN emp BY deptid, dept BY deptid;
dump emp_dept_innerjoin;
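With this data, every emp row finds its matching dept row; the first result tuple, for example, is:
(61,Tanvi,Manager,1,83000,1,Finance)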
3. LEFT join
emp_dept_left = JOIN emp BY deptid LEFT, dept BY deptid;
dump emp_dept_left ;
4. RIGHT JOIN
emp_dept_right = JOIN emp BY deptid RIGHT, dept BY deptid;
dump emp_dept_right;
5. FULL outer join
emp_dept_full= JOIN emp BY deptid FULL OUTER, dept BY deptid;
dump emp_dept_full ;
Cross Product
cross_prod = CROSS emp, dept;
dump cross_prod;
c. Combining & Splitting
SPLIT emp into sal1 if salary<54000, sal2 if salary>=54000;
dump sal1;
dump sal2;
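For the combining half of this practical, UNION (a standard Pig operator) merges the two relations back together:
combined = UNION sal1, sal2;
dump combined;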
d. Filtering, Sorting
filter_designation = FILTER emp BY designation == 'Manager';
dump filter_designation;
Order by
S = order emp by name desc;
dump S;
S = order emp by name asc;
dump S;
SPARK
Practical 2: Downloading a Data Set and Processing it in Spark
spark-shell
val mydfT = spark.read.csv("/home/hadoop/SparkT/student.csv")
mydfT.show()
mydfT.printSchema
mydfT.createOrReplaceTempView("BVIMIT")
val mydf2 = spark.sql("SELECT * FROM BVIMIT")
mydf2.show()
val mydf2 = spark.sql("describe BVIMIT")
mydf2.show
val mydf2 = spark.sql("SELECT * FROM BVIMIT where _c1 > 50")
mydf2.show
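Because the CSV is read without a header, Spark assigns default column names _c0, _c1, and so on, which is why the last query filters on _c1.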
Step 1: Create a DataFrame from a JSON file
val df = spark.read.json("/home/hadoop/SparkT/student.json")
If an error is shown (the JSON spans multiple lines), use this instead:
val df1 = spark.read.option("multiline","true").json("/home/hadoop/SparkT/student.json")
df1.show()
df1.printSchema()
df1.select("name").show()
df1.select("name","div").show()
OR
df1.select(df1("name"), df1("div")).show()
df1.filter(df1("rollno") > 50).show()
df1.groupBy("div").count().show()
df1.createOrReplaceTempView("people2")
val sqlDF1 = spark.sql("SELECT * FROM people2")
sqlDF1.show
df1.write.json("output")
Practical 3: Word Count in Apache Spark.
Create a file para.txt:
As we all know, a paragraph is a group of sentences that are connected and make absolute
sense. While writing a long essay or letter, we break them into paragraphs for better
understanding and to make a well-structured writing piece.
val data3 = sc.textFile("/home/hadoop/SparkT/para.txt")
data3.collect
val splitdata = data3.flatMap(line => line.split(" "));
splitdata.collect;
val mapdata = splitdata.map(word => (word,1));
mapdata.collect
val reducedata = mapdata.reduceByKey(_+_);
reducedata.collect
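To persist the counts instead of collecting them to the driver, the standard RDD call is (the output path here is just an example):
reducedata.saveAsTextFile("/home/hadoop/SparkT/wc_output")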