Source video: www.youtube.com/watch?v=zC9cnh8rJd0
PySpark using Jupyter Notebook
- Environment setup guide: www.sicara.ai/blog/2017-05-02-get-started-pyspark-jupyter-notebook-3-minutes
Creating an RDD
- from a Python data collection
- by loading from a file
Practice
- Examples of creating RDDs and of Spark word counting.
In [2]:
import findspark
findspark.init()
from pyspark import SparkContext
Creating an RDD (from data collections)
In [3]:
# sc : the entry-point object for using the Spark library from a Python shell
sc = SparkContext()
In [4]:
# parallelize() : converts collection-type data into an RDD; it does not load from files
a = sc.parallelize(range(1, 100)) # Spark has not done any work yet (lazy execution)
print("sc type : ", sc)
print("RDD collect() : ", a.collect()) # runs a job; Spark executes only now
print("the number of partitions : ", a.getNumPartitions()) # current partition count; since no cluster is used, it should equal the number of CPU cores
sc type : <SparkContext master=local[*] appName=pyspark-shell>
RDD collect() : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
the number of partitions : 12
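To make the lazy-execution point concrete, here is a minimal sketch reusing the RDD a above: transformations such as map and filter only record a lineage, and nothing runs until an action like take is called.

# hedged sketch: transformations are lazy, actions trigger execution
doubled = a.map(lambda x: x * 2)              # no job yet (transformation)
evens = doubled.filter(lambda x: x % 4 == 0)  # still no job (transformation)
print(evens.take(3))                          # action -> Spark runs now: [4, 8, 12]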
In [5]:
a = sc.parallelize(range(1, 100000), 3) # the number of partitions can be specified when calling parallelize
print("the number of partitions : ", a.getNumPartitions())
the number of partitions : 3
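As a side note, glom() can show how the elements were actually distributed; a minimal sketch on the 3-partition RDD above:

# hedged sketch: glom() turns each partition into a list, so mapping len
# over it gives the element count per partition
print(a.glom().map(len).collect())  # e.g. [33333, 33333, 33333]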
In [6]:
b = a.repartition(6) # repartition creates a new RDD with a changed number of partitions
print("the number of partitions : ", b.getNumPartitions())
the number of partitions : 6
In [7]:
c = b.coalesce(1) # coalesce can 'reduce' the number of partitions (more efficiently than repartition)
print("the number of partitions : ", c.getNumPartitions())
the number of partitions : 1
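The efficiency difference is that coalesce(n) merges existing partitions without a full shuffle, while repartition(n) always shuffles (it is equivalent to coalesce(n, shuffle=True)). A minimal sketch:

# hedged sketch: coalesce avoids a shuffle by default; shuffle=True forces one
d = b.coalesce(3)                # narrow merge, no shuffle
e = b.coalesce(3, shuffle=True)  # full shuffle, same as b.repartition(3)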
In [8]:
sc.stop()
Creating an RDD (from file)
In [3]:
import os
print(os.getcwd())
/home/user
In [5]:
path = "/home/user/data/"
In [14]:
sc = SparkContext()
spark_wiki = sc.textFile(path + "README.md") # an RDD whose elements are the lines of the file
In [15]:
spark_wiki
Out[15]:
/home/user/data/README.md MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0
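textFile also accepts a minimum partition count as its second argument; a minimal sketch assuming the same README.md path (wiki4 is a hypothetical name for illustration):

# hedged sketch: ask for at least 4 partitions when reading the file
wiki4 = sc.textFile(path + "README.md", minPartitions=4)
print(wiki4.getNumPartitions())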
In [16]:
spark_wiki.count() # total number of lines
Out[16]:
13
In [19]:
spark_wiki.take(5) # take is an action; returns the first 5 lines
Out[19]:
['### SPARK EXAMPLE', '1. SPARK RDD', '2. SPARK DATAFRAME', '', "Apache Spark is an open-source unified analytics engine for large-scale data processing. Spark provides an interface for programming entire clusters with implicit data parallelism and fault tolerance. Originally developed at the University of California, Berkeley's AMPLab, the Spark codebase was later donated to the Apache Software Foundation, which has maintained it since."]
In [20]:
spark_wiki.first() # action : first line
Out[20]:
'### SPARK EXAMPLE'
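filter is another common lazy transformation; a minimal sketch on the same RDD, where count() is the action that triggers execution:

# hedged sketch: keep only lines containing "Spark", then count them
spark_lines = spark_wiki.filter(lambda line: "Spark" in line)
print(spark_lines.count())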
In [27]:
a = spark_wiki.map(lambda line: line.split(" ")) # map : one output element (a list of words) per input line, preserving the per-line structure
In [28]:
b = spark_wiki.flatMap(lambda line: line.split(" ")) # flatMap : flattens the per-line lists into a single RDD of words
In [29]:
a.take(5)
Out[29]:
[['###', 'SPARK', 'EXAMPLE'], ['1.', 'SPARK', 'RDD'], ['2.', 'SPARK', 'DATAFRAME'], [''], ['Apache', 'Spark', 'is', 'an', 'open-source', 'unified', 'analytics', 'engine', 'for', 'large-scale', 'data', 'processing.', 'Spark', 'provides', 'an', 'interface', 'for', 'programming', 'entire', 'clusters', 'with', 'implicit', 'data', 'parallelism', 'and', 'fault', 'tolerance.', 'Originally', 'developed', 'at', 'the', 'University', 'of', 'California,', "Berkeley's", 'AMPLab,', 'the', 'Spark', 'codebase', 'was', 'later', 'donated', 'to', 'the', 'Apache', 'Software', 'Foundation,', 'which', 'has', 'maintained', 'it', 'since.']]
In [30]:
b.take(5)
Out[30]:
['###', 'SPARK', 'EXAMPLE', '1.', 'SPARK']
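Because flatMap already produced a flat RDD of individual words, set-style operations apply to it directly; a minimal sketch:

# hedged sketch: count the unique whitespace-split tokens
print(b.distinct().count())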
wordcount
In [32]:
wordcount = spark_wiki.flatMap(lambda line: line.split(" ")).\
            map(lambda x: (x, 1)).\
            reduceByKey(lambda x, y: x + y)
wordcount.collect()
Out[32]:
[('###', 1), ('SPARK', 3), ('EXAMPLE', 1), ('1.', 1), ('', 5), ('Apache', 7), ('Spark', 14), ('is', 7), ('an', 3), ('open-source', 1), ('unified', 1), ('analytics', 1), ('engine', 1), ('provides', 1), ('programming', 2), ('entire', 1), ('clusters', 1), ('implicit', 1), ('fault', 1), ('developed', 2), ('at', 1), ('University', 1), ('of', 13), ("Berkeley's", 1), ('AMPLab,', 1), ('codebase', 1), ('was', 3), ('in', 7), ('read-only', 1), ('multiset', 1), ('items', 1), ('fault-tolerant', 1), ('way.[2]', 1), ('The', 3), ('Dataframe', 1), ('as', 3), ('RDD,', 1), ('1.x,', 1), ('primary', 1), ('(API),', 1), ('but', 1), ('use', 2), ('even', 1), ('though', 1), ('deprecated.[4][5]', 1), ('technology', 1), ('API.[6][7]', 1), ('2012', 1), ('response', 1), ('limitations', 1), ('MapReduce', 3), ('forces', 1), ('particular', 1), ('programs', 2), ('read', 1), ('input', 1), ('reduce', 1), ('results', 2), ('map,', 1), ('store', 1), ("Spark's", 1), ('working', 1), ('set', 2), ('offers', 1), ('form', 1), ('memory.[8]', 1), ('facilitates', 1), ('both', 1), ('iterative', 2), ('visit', 1), ('multiple', 1), ('loop,', 1), ('database-style', 1), ('querying', 1), ('data.', 1), ('latency', 1), ('may', 1), ('several', 1), ('orders', 1), ('magnitude', 1), ('compared', 1), ('Among', 1), ('class', 1), ('algorithms', 2), ('are', 1), ('training', 1), ('machine', 3), ('learning', 1), ('systems,', 1), ('formed', 1), ('initial', 1), ('impetus', 1), ('developing', 1), ('Spark.[10]', 1), ('manager', 1), ('storage', 2), ('supports', 2), ('where', 2), ('scripts', 1), ('install', 1), ('package.', 1), ('It', 1), ('possible', 1), ('run', 2), ('these', 1), ('single', 2), ('YARN,', 1), ('Mesos', 1), ('[11]', 1), ('storage,', 1), ('variety,', 1), ('Alluxio,', 1), ('Distributed', 1), ('System', 2), ('Kudu,', 1), ('solution', 1), ('pseudo-distributed', 1), ('used', 2), ('only', 1), ('development', 1), ('purposes,', 1), ('scenario,', 1), ('executor', 1), ('CPU', 1), ('core.', 1), ('RDD', 4), ('2.', 1), ('DATAFRAME', 1), ('for', 7), ('large-scale', 1), ('data', 6), ('processing.', 1), ('interface', 3), ('with', 3), ('parallelism', 1), ('and', 6), ('tolerance.', 1), ('Originally', 1), ('the', 23), ('California,', 1), ('later', 1), ('donated', 1), ('to', 4), ('Software', 1), ('Foundation,', 1), ('which', 4), ('has', 2), ('maintained', 2), ('it', 1), ('since.', 1), ('its', 2), ('architectural', 1), ('foundation', 1), ('resilient', 1), ('distributed', 8), ('dataset', 1), ('(RDD),', 1), ('a', 17), ('over', 1), ('cluster', 5), ('machines,', 1), ('that', 2), ('API', 3), ('released', 1), ('abstraction', 1), ('on', 5), ('top', 1), ('followed', 1), ('by', 3), ('Dataset', 3), ('API.', 1), ('In', 1), ('application', 1), ('2.x', 1), ('encouraged[3]', 1), ('not', 2), ('still', 1), ('underlies', 1), ('RDDs', 2), ('were', 1), ('computing', 1), ('paradigm,', 1), ('linear', 1), ('dataflow', 1), ('structure', 1), ('programs:', 1), ('from', 1), ('disk,', 1), ('map', 1), ('function', 2), ('across', 1), ('data,', 1), ('reduction', 1), ('disk.', 1), ('(deliberately)', 1), ('restricted', 1), ('shared', 1), ('implementation', 1), ('algorithms,', 1), ('their', 1), ('times', 1), ('interactive/exploratory', 1), ('analysis,', 1), ('i.e.,', 1), ('repeated', 1), ('such', 2), ('applications', 1), ('be', 3), ('reduced', 1), ('Hadoop', 3), ('implementation.[2][9]', 1), ('requires', 1), ('system.', 1), ('For', 2), ('management,', 1), ('standalone', 1), ('(native', 1), ('cluster,', 1), ('you', 1), ('can', 4), ('launch', 2), ('either', 1), ('manually', 1), ('or', 4), 
('provided', 1), ('also', 2), ('daemons', 1), ('testing),', 1), ('Kubernetes.', 1), ('wide', 1), ('including', 1), ('File', 2), ('(HDFS),[12]', 1), ('MapR', 1), ('(MapR-FS),[13]', 1), ('Cassandra,[14]', 1), ('OpenStack', 1), ('Swift,', 1), ('Amazon', 1), ('S3,', 1), ('Lustre', 1), ('file', 2), ('system,[15]', 1), ('custom', 1), ('implemented.', 1), ('local', 2), ('mode,', 1), ('usually', 1), ('testing', 1), ('required', 1), ('system', 1), ('instead;', 1), ('one', 1), ('per', 1)]
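To read the result more easily, the pairs can be ordered by count; a minimal sketch using takeOrdered on the wordcount RDD above:

# hedged sketch: negate the count so the most frequent words come first
top10 = wordcount.takeOrdered(10, key=lambda kv: -kv[1])
print(top10)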