-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
apache-spark.html
529 lines (464 loc) · 36.2 KB
/
apache-spark.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta content="width=device-width, initial-scale=1.0" name="viewport">
<title>Apache Spark</title>
<meta content="" name="description">
<meta content="" name="keywords">
<!-- Favicons -->
<link href="assets/img/Favicon-1.png" rel="icon">
<link href="assets/img/Favicon-1.png" rel="apple-touch-icon">
<!-- Google Fonts -->
<link href="https://fonts.googleapis.com/css?family=Open+Sans:300,300i,400,400i,600,600i,700,700i|Raleway:300,300i,400,400i,500,500i,600,600i,700,700i|Poppins:300,300i,400,400i,500,500i,600,600i,700,700i" rel="stylesheet">
<!-- Vendor CSS Files -->
<link href="assets/vendor/aos/aos.css" rel="stylesheet">
<link href="assets/vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet">
<link href="assets/vendor/bootstrap-icons/bootstrap-icons.css" rel="stylesheet">
<link href="assets/vendor/boxicons/css/boxicons.min.css" rel="stylesheet">
<link href="assets/vendor/glightbox/css/glightbox.min.css" rel="stylesheet">
<link href="assets/vendor/swiper/swiper-bundle.min.css" rel="stylesheet">
<!-- Creating a python code section-->
<link rel="stylesheet" href="assets/css/prism.css">
<script src="assets/js/prism.js"></script>
<!-- Template Main CSS File -->
<link href="assets/css/style.css" rel="stylesheet">
<!-- To set the icon, visit https://fontawesome.com/account-->
<script src="https://kit.fontawesome.com/5d25c1efd3.js" crossorigin="anonymous"></script>
<!-- end of icon-->
<script type="text/javascript" async
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML">
</script>
<!-- =======================================================
* Template Name: iPortfolio
* Updated: Sep 18 2023 with Bootstrap v5.3.2
* Template URL: https://bootstrapmade.com/iportfolio-bootstrap-portfolio-websites-template/
* Author: BootstrapMade.com
* License: https://bootstrapmade.com/license/
======================================================== -->
</head>
<body>
<!-- ======= Mobile nav toggle button ======= -->
<i class="bi bi-list mobile-nav-toggle d-xl-none"></i>
<!-- ======= Header ======= -->
<header id="header">
<div class="d-flex flex-column">
<div class="profile">
<img src="assets/img/myphoto.jpeg" alt="" class="img-fluid rounded-circle">
<h1 class="text-light"><a href="index.html">Arun</a></h1>
<div class="social-links mt-3 text-center">
<a href="https://www.linkedin.com/in/arunp77/" target="_blank" class="linkedin"><i class="bx bxl-linkedin"></i></a>
<a href="https://github.com/arunp77" target="_blank" class="github"><i class="bx bxl-github"></i></a>
<a href="https://twitter.com/arunp77_" target="_blank" class="twitter"><i class="bx bxl-twitter"></i></a>
<a href="https://www.instagram.com/arunp77/" target="_blank" class="instagram"><i class="bx bxl-instagram"></i></a>
<a href="https://arunp77.medium.com/" target="_blank" class="medium"><i class="bx bxl-medium"></i></a>
</div>
</div>
<nav id="navbar" class="nav-menu navbar">
<ul>
<li><a href="index.html#hero" class="nav-link scrollto active"><i class="bx bx-home"></i> <span>Home</span></a></li>
<li><a href="index.html#about" class="nav-link scrollto"><i class="bx bx-user"></i> <span>About</span></a></li>
<li><a href="index.html#resume" class="nav-link scrollto"><i class="bx bx-file-blank"></i> <span>Resume</span></a></li>
<li><a href="index.html#portfolio" class="nav-link scrollto"><i class="bx bx-book-content"></i> <span>Portfolio</span></a></li>
<li><a href="index.html#skills-and-tools" class="nav-link scrollto"><i class="bx bx-wrench"></i> <span>Skills and Tools</span></a></li>
<li><a href="index.html#language" class="nav-link scrollto"><i class="bi bi-menu-up"></i> <span>Languages</span></a></li>
<li><a href="index.html#awards" class="nav-link scrollto"><i class="bi bi-award-fill"></i> <span>Awards</span></a></li>
<li><a href="index.html#professionalcourses" class="nav-link scrollto"><i class="bx bx-book-alt"></i> <span>Professional Certification</span></a></li>
<li><a href="index.html#publications" class="nav-link scrollto"><i class="bx bx-news"></i> <span>Publications</span></a></li>
<!-- <li><a href="index.html#extra-curricular" class="nav-link scrollto"><i class="bx bx-rocket"></i> <span>Extra-Curricular Activities</span></a></li> -->
<!-- <li><a href="#contact" class="nav-link scrollto"><i class="bx bx-envelope"></i> <span>Contact</span></a></li> -->
</ul>
</nav><!-- .nav-menu -->
</div>
</header><!-- End Header -->
<main id="main">
<!-- ======= Breadcrumbs ======= -->
<section id="breadcrumbs" class="breadcrumbs">
<div class="container">
<div class="d-flex justify-content-between align-items-center">
<h2>Data Engineering</h2>
<ol>
<li><a href="Data-engineering.html" class="clickable-box">Content section</a></li>
<li><a href="index.html#portfolio" class="clickable-box">Portfolio section</a></li>
</ol>
</div>
</div>
</section><!-- End Breadcrumbs -->
<!------ right dropdown menue ------->
<div class="right-side-list">
<div class="dropdown">
<button class="dropbtn"><strong>Shortcuts:</strong></button>
<div class="dropdown-content">
<ul>
<li><a href="cloud-compute.html"><i class="fas fa-cloud"></i> Cloud</a></li>
<li><a href="AWS-GCP.html"><i class="fas fa-cloud"></i> AWS-GCP</a></li>
<li><a href="amazon-s3.html"><i class="fas fa-cloud"></i> AWS S3</a></li>
<li><a href="ec2-confi.html"><i class="fas fa-server"></i> EC2</a></li>
<li><a href="Docker-Container.html"><i class="fab fa-docker" style="color: rgb(29, 27, 27);"></i> Docker</a></li>
<li><a href="Jupyter-nifi.html"><i class="fab fa-python" style="color: rgb(34, 32, 32);"></i> Jupyter-nifi</a></li>
<li><a href="snowflake-task-stream.html"><i class="fas fa-snowflake"></i> Snowflake</a></li>
<li><a href="data-model.html"><i class="fas fa-database"></i> Data modeling</a></li>
<li><a href="sql-basics.html"><i class="fas fa-table"></i> SQL basics</a></li>
<li><a href="sql-basic-details.html"><i class="fas fa-database"></i> SQL</a></li>
<li><a href="Bigquerry-sql.html"><i class="fas fa-database"></i> BigQuery</a></li>
<li><a href="scd.html"><i class="fas fa-archive"></i> SCD</a></li>
<li><a href="sql-project.html"><i class="fas fa-database"></i> SQL project</a></li>
<!-- Add more subsections as needed -->
</ul>
</div>
</div>
</div>
<!-- ======= Portfolio Details Section ======= -->
<section id="portfolio-details" class="portfolio-details">
<div class="container">
<div class="row gy-4">
<h1>Apache Spark: Big data processing</h1>
<div class="col-lg-8">
<div class="portfolio-details-slider swiper">
<div class="swiper-wrapper align-items-center">
<figure>
<img src="assets/img/data-engineering/Apache_Spark_logo.svg.png" alt="" style="max-width: 50%; max-height: auto;">
<figcaption></figcaption>
</figure>
</div>
<div class="swiper-pagination"></div>
</div>
</div>
<div class="col-lg-4 grey-box">
<div class="section-title">
<h3>Table of Contents</h3>
<ol>
<li><a href="#introduction">Introduction to Apache Spark</a></li>
<li><a href="#how-it-works">How does Spark work?</a></li>
<ul>
<li><a href="#difference">Difference between Hadoop and Apache Spark</a></li>
<li><a href="#key-concept">Some key concepts of Apache Spark</a></li>
<li><a href="#key-features">Key Features</a></li>
</ul>
<li><a href="#Components">Components of Apache Spark</a></li>
<ul>
<li><a href="#use-cases">Use Cases</a></li>
</ul>
<li><a href="#rdd">Resilient Distributed Datasets (RDD)</a></li>
<li><a href="#reference">Reference</a></li>
</ol>
</div>
</div>
</div>
<section>
<h3 id="introduction">Introduction to Apache Spark</h3>
<a href="https://spark.apache.org/" target="_blank" rel="noopener">Apache Spark™</a> is an open-source, distributed computing system for processing and analyzing large datasets.
It was initially developed in 2009 at the AMPLab at UC Berkeley
and later donated to the Apache Software Foundation in 2013.
<ul>
<li>Spark's primary feature is its ability to perform data processing in-memory, which makes it significantly faster
than Apache <a href="https://hadoop.apache.org/" target="_blank" rel="noopener">Hadoop®</a> for certain types of workloads. Spark supports various types of big data workloads,
including batch processing, interactive queries, streaming, machine
learning, and graph processing.</li>
<li>On the other hand, Apache <a href="https://hadoop.apache.org/" target="_blank" rel="noopener">Hadoop®</a> is a platform that was first developed in 2006 and is used for distributed
storage and processing of large datasets. <a href="https://hadoop.apache.org/" target="_blank" rel="noopener">Hadoop®</a> is built using
the MapReduce programming model, which is based on the idea of mapping and reducing data to process it in parallel across a distributed cluster of computers.
<a href="https://hadoop.apache.org/" target="_blank" rel="noopener">Hadoop®</a> is known
for its fault-tolerance, scalability, and cost-effectiveness.</li>
<li>Imagine a powerful engine built for speed and endurance. That's essentially what Spark is. It's an open-source, unified analytics engine designed to process massive
datasets efficiently and at lightning speed. It achieves this through its unique distributed architecture, where tasks are split across multiple machines, working
in parallel to crunch through your data.</li>
</ul>
<!--------------------->
<h3 id="how-it-works">How does Spark work?</h3>
<p>Spark's operational model adheres to the hierarchical primary-secondary principle, commonly known as the master-slave principle. The Spark driver functions as the master node, overseen by the
cluster manager, which, in turn, manages the slave nodes and directs data analyses to the client. The distribution and monitoring of executions and queries are facilitated through the SparkContext,
established by the Spark driver, collaborating with cluster managers like Spark, YARN, Hadoop, or Kubernetes. The Resilient Distributed Datasets (RDDs) play a pivotal role in this process.</p>
<p>Spark dynamically determines resource utilization for data querying or storage, deciding where queried data is directed. The engine's ability to process data directly in the memory of server
clusters dynamically reduces latency and ensures rapid performance. Additionally, the implementation of parallel work steps and the utilization of both virtual and physical memory contribute to its efficiency.</p>
<p>Apache Spark also processes data from various data stores. These include the Hadoop Distributed File System (HDFS) and relational data storage such as Hive or NoSQL databases. In addition, there is performance-enhancing in-memory or hard-disk processing, depending on how large the data sets in question are.</p>
<div class="grey-box">
<strong>Hadoop: </strong> Hadoop is a framework written in Java that utilizes a large cluster of commodity hardware to maintain and store big size data.
Hadoop works on MapReduce Programming Algorithm that was introduced by Google. Today lots of Big Brand Companies are using Hadoop in their Organization
to deal with big data, eg. Facebook, Yahoo, Netflix, eBay, etc. The Hadoop Architecture Mainly consists of 4 components.
<ul>
<li>MapReduce</li>
<li>HDFS(Hadoop Distributed File System)</li>
<li>YARN(Yet Another Resource Negotiator)</li>
<li>Common Utilities or Hadoop Common</li>
</ul>
For more details, see <a href="hadoop.html">Hadoop</a>.
</div>
<br>
<!-------------------->
<h5 id="difference">Difference between Hadoop® and Apache Spark™</h5>
<ul>
<li><strong>Performance: </strong>Spark is generally faster than <a href="https://hadoop.apache.org/" target="_blank" rel="noopener">Hadoop®</a> for data processing tasks because
it performs computations in-memory, reducing the need to read and write data to disk. Spark can be up to 100 times faster than
<a href="https://hadoop.apache.org/" target="_blank" rel="noopener">Hadoop®</a> for certain workloads.</li>
<li><strong>Cost: </strong>Spark can be more expensive than <a href="https://hadoop.apache.org/" target="_blank" rel="noopener">Hadoop®</a> because it requires more memory to perform
in-memory computations. <a href="https://hadoop.apache.org/" target="_blank" rel="noopener">Hadoop®</a>, on the other hand, is known for its cost-effectiveness as it can run on commodity hardware.</li>
<li><strong>Machine Learning Algorithms: </strong>Spark provides built-in machine learning algorithms in its <a href="https://spark.apache.org/mllib/" target="_blank" rel="noopener">MLlib</a>
library, which can be used for regression, classification, clustering, and other machine learning tasks. <a href="https://hadoop.apache.org/" target="_blank" rel="noopener">Hadoop®</a>
does not have built-in machine learning libraries, but it can be integrated with other machine learning frameworks such as
<a href="https://mahout.apache.org/" target="_blank" rel="noopener">Apache Mahout</a>.</li>
<li><strong>Data Processing: </strong><a href="https://hadoop.apache.org/" target="_blank" rel="noopener">Hadoop®</a> is primarily used for batch processing, which involves processing large
datasets in batches. Spark supports batch processing, but it also supports interactive queries, streaming, and graph processing.</li>
<li><strong>Programming Languages: </strong><a href="https://hadoop.apache.org/" target="_blank" rel="noopener">Hadoop®</a> is primarily written in Java, and its MapReduce programming model
is based on Java. Spark supports programming in Java, Scala, Python, and R, making it more accessible to a wider range of developers.</li>
</ul>
<!-------------------->
<h5 id="key-concept">Some key concepts of Apache Spark™</h5>
<ul>
<li><strong>Distributed Processing: </strong>Unlike traditional methods that process data on a single machine, Spark distributes tasks across multiple machines
(nodes) in a cluster. This allows it to handle massive datasets efficiently by dividing the work and processing it in parallel. Think of it as having many
chefs working on different parts of a large meal instead of just one.</li>
<li><strong>Resilience: </strong>Spark is designed to be fault-tolerant. If a node fails, the work is automatically reassigned to other nodes, ensuring your processing
continues uninterrupted. This is like having backup chefs ready to step in if someone gets sick.</li>
<li><strong>In-Memory Computing: </strong>Spark stores frequently accessed data in memory for faster processing compared to reading it from disk. This is like having
your ingredients readily available on the counter instead of searching the pantry every time you need something.</li>
<li><strong>Lineage (i.e. version): </strong>Spark keeps track of how data is transformed, allowing you to trace the origin and understand how results were obtained.
This is like having a recipe that shows you each step involved in creating the final dish.</li>
<li><strong>APIs: </strong>Spark provides APIs in various languages like Python, Java, Scala, and R. This allows you to choose the language you're most comfortable with
and write code in a familiar syntax. It's like having different tools (spatulas, whisks, etc.) that work with the same ingredients.</li>
<li><strong>Functional Programming: </strong>Spark heavily utilizes functional programming concepts like immutability and lazy evaluation. This makes code cleaner, easier
to reason about, and more resistant to errors. Think of it as following precise instructions without modifying the ingredients themselves.</li>
</ul>
<!-------------------->
<h5 id="key-features">Key Features</h5>
Apache Spark is a significantly faster and more powerful engine than Apache Hadoop or Apache Hive. It processes jobs 100 times faster when processing occurs in memory and 10 times
faster when processing occurs on disk compared to Hadoop's MapReduce. Spark therefore offers companies cost-reducing and efficiency-increasing performance.
What is particularly interesting about Spark is its flexibility. This means that the engine can not only be run independently, but also in Hadoop clusters controlled via YARN.
It also enables developers to write applications for Spark in various programming languages. Not only SQL, but also Python, Scala, R or Java come into question.
Other special features of Spark: it does not have to be set up on the Hadoop file system, but can also be operated with other data platforms such as AWS S3, Apache Cassandra or HBase.
In addition, when specifying the data source, it processes both batch processes such as Hadoop as well as stream data and various workloads with almost identical code. Using an
interactive query process, current and historical real-time data, including analysis, can be distributed across multiple layers on the hard drive and memory and processed in parallel.
A summary of the key features is as follows:
<ul>
<li><strong>Speed: </strong>Spark performs in-memory processing, making it much faster than its predecessor, MapReduce. It achieves high performance through advanced DAG
(Directed Acyclic Graph) execution engine.</li>
<li><strong>Ease of Use: </strong>Spark supports multiple programming languages, including Scala, Java, Python, and R. Its high-level APIs make it easy to use for both
batch processing and real-time data processing.</li>
<li><strong>Versatility: </strong>Spark's ecosystem includes libraries for data analysis, machine learning (<a href="https://spark.apache.org/mllib/" target="_blank" rel="noopener">MLlib</a>),
graph processing (GraphX), and stream processing
(Structured Streaming). This versatility makes it a
comprehensive solution for various data-related tasks.</li>
<li><strong>Fault Tolerance: </strong>Spark provides fault tolerance through lineage information and recomputation, ensuring that tasks are re-executed in case of node failures.</li>
<li><strong>Compatibility: </strong>It can run on various cluster managers, such as Apache Mesos, <a href="https://hadoop.apache.org/" target="_blank" rel="noopener">Hadoop®</a> YARN, or Kubernetes,
making it compatible with different big data
environments.</li>
</ul>
<!--------------------->
<h3 id="Components">Components of Apache Spark™</h3>
Spark Core is the fundamental component of <a href="https://spark.apache.org/" target="_blank" rel="noopener">Apache Spark™</a>, serving as the project's foundation. It offers distributed task
dispatching, scheduling, and basic I/O functionalities.
Spark Core provides an application programming interface (API) for multiple programming languages like Java, Python, Scala, .NET, and R. This API is centered around the Resilient
Distributed Datasets (RDD) abstraction. The Java API, while primarily for JVM languages, can also be used with some non-JVM languages that can connect to the JVM, such as Julia.
<ul>
<li><strong>Spark Core: </strong>Spark Core is the underlying general execution engine for spark platform that all other functionality is built upon. It provides In-Memory computing and referencing datasets in external storage systems.
It provides the basic functionality of Spark, including task scheduling, memory management, and fault recovery.</li>
<li><strong>Spark SQL: </strong>Spark SQL is a component on top of Spark Core that introduces a new data abstraction called SchemaRDD, which provides support for structured and semi-structured data.
Allows for querying structured data using SQL as well as Spark's built-in DataFrame API.</li>
<li><strong>Spark Streaming: </strong>Spark Streaming leverages Spark Core's fast scheduling capability to perform streaming analytics. It ingests data in mini-batches and performs RDD (Resilient Distributed Datasets) transformations on those mini-batches of data.
Enables scalable, high-throughput, fault-tolerant stream processing of live data streams.</li>
<li><strong>Spark DataFrames: </strong>High-level data structures for manipulating data like a spreadsheet.</li>
<li><strong>Spark <a href="https://spark.apache.org/mllib/" target="_blank" rel="noopener">MLlib</a> (Machine Learning Library): </strong>A distributed machine learning framework for
building scalable and robust machine learning models. Spark MLlib is nine times as fast as the Hadoop disk-based version of Apache Mahout (before Mahout gained a Spark interface).</li>
<li><strong>Spark GraphX: </strong>GraphX is a distributed graph-processing framework on top of Spark. It provides an API for expressing graph computation that can model the user-defined graphs by using Pregel abstraction API. It also provides an optimized runtime for this abstraction.</li>
<li><strong>Spark Structured Streaming: </strong>Allows users to express streaming computations the same way as batch computations, providing unified processing for both.</li>
</ul>
<figure>
<img src="assets/img/data-engineering/apache-spark1.png" alt="" style="max-width: 50%; max-height: auto;">
<figcaption style="text-align: center;"><strong>Language support: </strong><a href="https://spark.apache.org/" target="_blank" rel="noopener">Apache Spark™</a> has built-in support
for Scala, Java, SQL, R, and Python with 3rd party support for
the .NET CLR, Julia, and more.</figcaption>
</figure>
<!-------------------->
<h4 id="use-cases">Use Cases</h4>
<ul>
<li><strong>Big Data Processing:</strong> Spark is widely used for processing and analyzing large datasets efficiently.</li>
<li><strong>Machine Learning:</strong> <a href="https://spark.apache.org/mllib/" target="_blank" rel="noopener">MLlib</a> simplifies the development of scalable machine learning applications.</li>
<li><strong>Graph Processing:</strong> GraphX enables the processing of large-scale graph data.</li>
<li><strong>Real-time Analytics:</strong> Spark Streaming and Structured Streaming support real-time analytics. </li>
</ul>
<!----------------------->
<h3 id="spark-Architecture">Spark Architecture</h3>
Apache Spark's architecture is designed for efficient and scalable processing of large datasets. It achieves this through two main pillars:
<ol>
<li><strong>Master-Slave Architecture: </strong>
<ul>
<li><strong>Master (Driver): </strong>Submits applications, coordinates task scheduling, and manages communication between components. Think of it as the conductor of an orchestra, overseeing the entire performance.</li>
<li><strong>Slaves (Executors): </strong>Worker nodes distributed across the cluster, carrying out the actual computations on partitioned data. Imagine them as the different sections of the orchestra, each playing their assigned parts.</li>
</ul>
</li>
<li><strong>Data Abstraction: </strong>
<ul>
<li><strong>Resilient Distributed Datasets (RDDs): </strong>Immutable, fault-tolerant collections of data distributed across the cluster. RDDs are the fundamental unit of data in Spark and can be created from various sources like files, databases, or other RDDs. Think of them as sheet music distributed to each section of the orchestra, ensuring everyone plays the same song simultaneously.</li>
</ul>
</li>
</ol>
Apache Spark follows a distributed architecture designed for scalability and fault tolerance. The key components in Spark's architecture include:
<ul>
<li><strong>Driver Program:</strong>
<ul>
<li>The entry point of any Spark application.</li>
<li>Contains the main function and creates a SparkContext to coordinate the execution of tasks.</li>
</ul>
</li>
<li><strong>Cluster Manager:</strong>
<ul>
<li>Manages resources across the cluster.</li>
<li>Common cluster managers include Apache Mesos, Hadoop YARN, and Spark's standalone cluster manager.</li>
</ul>
</li>
<li><strong>SparkContext:</strong>
<ul>
<li>Created by the driver program and coordinates the execution of tasks on the cluster.</li>
<li>Communicates with the cluster manager to acquire resources and manage the execution of tasks.</li>
</ul>
</li>
<li><strong>Distributed Data:</strong>
<ul>
<li>Data is distributed across the cluster in partitions.</li>
<li>Resilient Distributed Datasets (RDDs) or DataFrames represent distributed collections of data.</li>
</ul>
</li>
<li><strong>Executor:</strong>
<ul>
<li>Each worker node in the cluster has an executor.</li>
<li>Executes tasks assigned by the SparkContext and manages the data stored on that node.</li>
</ul>
</li>
<li><strong>Task:</strong>
<ul>
<li>The smallest unit of work in Spark.</li>
<li>Executed on an executor and performs operations on the partitions of the distributed data.</li>
</ul>
</li>
<li><strong>Job:</strong>
<ul>
<li>A collection of tasks that are submitted to Spark for execution.</li>
<li>Jobs are divided into stages based on the transformations and actions applied to the data.</li>
</ul>
</li>
<li><strong>Stage:</strong>
<ul>
<li>A set of tasks that can be executed in parallel without shuffling data between them.</li>
<li>Stages are determined by the transformations in the application.</li>
</ul>
</li>
<li><strong>RDD lineage and DAG:</strong>
<ul>
<li>RDD lineage records the sequence of transformations applied to construct an RDD.</li>
<li>Directed Acyclic Graph (DAG) is a logical representation of the sequence of stages and tasks.</li>
</ul>
</li>
<li><strong>Shuffling:</strong>
<ul>
<li>Occurs when data needs to be redistributed across the cluster, typically between stages.</li>
<li>Can be an expensive operation in terms of performance.</li>
</ul>
</li>
<li><strong>Broadcasting:</strong>
<ul>
<li>Efficiently sends read-only variables to all the worker nodes, reducing data transfer overhead.</li>
</ul>
</li>
</ul>
<figure>
<img src="assets/img/data-engineering/spark-architecture.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;">Generated through: <a href="https://mermaid.live/" target="_blank">mermaid.live</a> using graph LR code given below.</figcaption>
</figure>
<pre>
#!graph lr
graph LR
subgraph cluster_driver
Driver_Program --> SparkContext
end
subgraph cluster_spark
SparkContext --> Cluster_Manager
end
subgraph cluster_manager
Cluster_Manager --> |Allocates Resources| Executor_1
Cluster_Manager --> |Allocates Resources| Executor_2
Cluster_Manager --> |Allocates Resources| Executor_N
end
subgraph cluster_executor
Executor_1 --> Task_1
Executor_2 --> Task_2
Executor_N --> Task_N
end
subgraph cluster_task
Task_1 --> Distributed_Data_1
Task_2 --> Distributed_Data_2
Task_N --> Distributed_Data_N
end
</pre>
<!------------------------>
<h3 id="rdd">Resilient Distributed Datasets (RDD)</h3>
Resilient Distributed Datasets (RDD) is a fundamental data structure of Spark. It is an immutable distributed collection of objects. Each dataset in RDD is divided into logical partitions,
which may be computed on different nodes of the cluster. RDDs can contain any type of Python, Java, or Scala objects, including user-defined classes.
There are two ways to create RDDs:
<ul>
<li>parallelizing an existing collection in your driver program or</li>
<li> referencing a dataset in an external storage system, such as a shared file system, HDFS, HBase, or any data source offering a Hadoop Input Format.</li>
</ul>
<p>Data sharing is slow in MapReduce due to replication, serialization, and disk IO. Most Hadoop applications spend more than 90% of their time doing HDFS read-write operations.
Recognizing this problem, researchers developed a specialized framework called Apache Spark. The key idea of Spark is Resilient Distributed Datasets (RDD); it supports in-memory
processing computation. This means it stores the state of memory as an object across the jobs, and the object is sharable between those jobs. Data sharing in memory is 10 to 100
times faster than network and disk.
</p>
</section>
<!-------Reference ------->
<section id="reference">
<h3>References</h3>
<ol>
<li><a href="https://spark.apache.org/docs/latest/" target="_blank" rel="noopener">Apache Spark™ Official Documentation</a>.</li>
<li><a href="https://www.databricks.com/learn" target="_blank">Databricks Learning Academy</a>.</li>
<li><a href="https://sparkbyexamples.com/" target="_blank">Spark by Examples</a>.</li>
<li><a href="https://medium.com/@gkadam2011/beneath-rdd-resilient-distributed-dataset-in-apache-spark-260c0b7250c6" target="_blank">Beneath RDD(Resilient Distributed Dataset) in Apache Spark</a></li>
</ol>
</section>
<hr>
<div style="background-color: #f0f0f0; padding: 15px; border-radius: 5px;">
<h3>Some other interesting things to know:</h3>
<ul style="list-style-type: disc; margin-left: 30px;">
<li>Visit my website on <a href="sql-project.html">For Data, Big Data, Data-modeling, Datawarehouse, SQL, cloud-compute.</a></li>
<li>Visit my website on <a href="Data-engineering.html">Data engineering</a></li>
</ul>
</div>
<p></p>
<div class="navigation">
<a href="index.html#portfolio" class="clickable-box">
<span class="arrow-left">Portfolio section</span>
</a>
<a href="Data-engineering.html" class="clickable-box">
<span class="arrow-right">Content</span>
</a>
</div>
</div>
</section><!-- End Portfolio Details Section -->
</main><!-- End #main -->
<!-- ======= Footer ======= -->
<footer id="footer">
<div class="container">
<div class="copyright">
© Copyright <strong><span>Arun</span></strong>
</div>
</div>
</footer><!-- End Footer -->
<a href="#" class="back-to-top d-flex align-items-center justify-content-center"><i class="bi bi-arrow-up-short"></i></a>
<!-- Vendor JS Files -->
<script src="assets/vendor/purecounter/purecounter_vanilla.js"></script>
<script src="assets/vendor/aos/aos.js"></script>
<script src="assets/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
<script src="assets/vendor/glightbox/js/glightbox.min.js"></script>
<script src="assets/vendor/isotope-layout/isotope.pkgd.min.js"></script>
<script src="assets/vendor/swiper/swiper-bundle.min.js"></script>
<script src="assets/vendor/typed.js/typed.umd.js"></script>
<script src="assets/vendor/waypoints/noframework.waypoints.js"></script>
<script src="assets/vendor/php-email-form/validate.js"></script>
<!-- Template Main JS File -->
<script src="assets/js/main.js"></script>
<script>
// Initialize syntax highlighting once the DOM is parsed.
// Bug fix: this page loads Prism (assets/js/prism.js in <head>), not
// highlight.js, so the original unconditional call to
// hljs.initHighlightingOnLoad() threw a ReferenceError on every load.
// Guard on the library actually being present, and fall back to Prism's
// manual trigger when it is available.
document.addEventListener("DOMContentLoaded", function () {
  if (typeof hljs !== "undefined") {
    hljs.initHighlightingOnLoad();
  } else if (typeof Prism !== "undefined") {
    Prism.highlightAll();
  }
});
</script>
</body>
</html>