import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import *
# Create a Spark session
spark = SparkSession.builder.appName("GooglePlayStore").getOrCreate()
# Read the CSV file
df = spark.read.csv('googleplaystore.csv', header=True, inferSchema=True, escape='"')
df.sample(False, 0.1).show(5, truncate=True)
+--------------------+-----------------+------+-------+----+--------+----+-----+--------------+---------------+----------------+-----------+-----------+ | App| Category|Rating|Reviews|Size|Installs|Type|Price|Content Rating| Genres| Last Updated|Current Ver|Android Ver| +--------------------+-----------------+------+-------+----+--------+----+-----+--------------+---------------+----------------+-----------+-----------+ |Photo Designer - ...| ART_AND_DESIGN| 4.7| 3632|5.5M|500,000+|Free| 0| Everyone| Art & Design| July 31, 2018| 3.1| 4.1 and up| | Art Drawing Ideas| ART_AND_DESIGN| 4.1| 227|5.2M| 50,000+|Free| 0| Everyone| Art & Design| May 31, 2018| 1.2| 2.3 and up| | I Creative Idea| ART_AND_DESIGN| 4.7| 353|4.2M| 10,000+|Free| 0| Teen| Art & Design| April 27, 2018| 1.6| 4.1 and up| |Gas Prices (Germa...|AUTO_AND_VEHICLES| 4.4| 805|5.6M| 50,000+|Free| 0| Everyone|Auto & Vehicles| July 29, 2018| 2.5.1| 4.4 and up| | Used Cars Mexico|AUTO_AND_VEHICLES| 4.0| 190|2.5M| 50,000+|Free| 0| Everyone|Auto & Vehicles|October 14, 2016| 1.0| 2.3 and up| +--------------------+-----------------+------+-------+----+--------+----+-----+--------------+---------------+----------------+-----------+-----------+ only showing top 5 rows
df.printSchema()
root |-- App: string (nullable = true) |-- Category: string (nullable = true) |-- Rating: double (nullable = true) |-- Reviews: string (nullable = true) |-- Size: string (nullable = true) |-- Installs: string (nullable = true) |-- Type: string (nullable = true) |-- Price: string (nullable = true) |-- Content Rating: string (nullable = true) |-- Genres: string (nullable = true) |-- Last Updated: string (nullable = true) |-- Current Ver: string (nullable = true) |-- Android Ver: string (nullable = true)
for col in df.columns:
counts = df.groupBy(col).agg(count("*").alias("count")).orderBy("count", ascending=False).limit(5)
print(f"df['{col}']")
counts.show(truncate=False)
print("\n")
df['App'] +-------------------------------------------------+-----+ |App |count| +-------------------------------------------------+-----+ |ROBLOX |9 | |CBS Sports App - Scores, News, Stats & Watch Live|8 | |ESPN |7 | |8 Ball Pool |7 | |Candy Crush Saga |7 | +-------------------------------------------------+-----+ df['Category'] +--------+-----+ |Category|count| +--------+-----+ |FAMILY |1972 | |GAME |1144 | |TOOLS |843 | |MEDICAL |463 | |BUSINESS|460 | +--------+-----+ df['Rating'] +------+-----+ |Rating|count| +------+-----+ |NaN |1474 | |4.4 |1109 | |4.3 |1076 | |4.5 |1038 | |4.2 |952 | +------+-----+ df['Reviews'] +-------+-----+ |Reviews|count| +-------+-----+ |0 |596 | |1 |272 | |2 |214 | |3 |175 | |4 |137 | +-------+-----+ df['Size'] +------------------+-----+ |Size |count| +------------------+-----+ |Varies with device|1695 | |11M |198 | |12M |196 | |14M |194 | |13M |191 | +------------------+-----+ df['Installs'] +-----------+-----+ |Installs |count| +-----------+-----+ |1,000,000+ |1579 | |10,000,000+|1252 | |100,000+ |1169 | |10,000+ |1054 | |1,000+ |907 | +-----------+-----+ df['Type'] +----+-----+ |Type|count| +----+-----+ |Free|10039| |Paid|800 | |0 |1 | |NaN |1 | +----+-----+ df['Price'] +-----+-----+ |Price|count| +-----+-----+ |0 |10040| |$0.99|148 | |$2.99|129 | |$1.99|73 | |$4.99|72 | +-----+-----+ df['Content Rating'] +---------------+-----+ |Content Rating |count| +---------------+-----+ |Everyone |8714 | |Teen |1208 | |Mature 17+ |499 | |Everyone 10+ |414 | |Adults only 18+|3 | +---------------+-----+ df['Genres'] +-------------+-----+ |Genres |count| +-------------+-----+ |Tools |842 | |Entertainment|623 | |Education |549 | |Medical |463 | |Business |460 | +-------------+-----+ df['Last Updated'] +--------------+-----+ |Last Updated |count| +--------------+-----+ |August 3, 2018|326 | |August 2, 2018|304 | |July 31, 2018 |294 | |August 1, 2018|285 | |July 30, 2018 |211 | +--------------+-----+ df['Current Ver'] +------------------+-----+ |Current Ver |count| +------------------+-----+ |Varies with device|1459 | |1.0 |809 | |1.1 |264 | |1.2 |178 | |2.0 |151 | +------------------+-----+ df['Android Ver'] +------------------+-----+ |Android Ver |count| +------------------+-----+ |4.1 and up |2451 | |4.0.3 and up |1501 | |4.0 and up |1375 | |Varies with device|1362 | |4.4 and up |980 | +------------------+-----+
From The Result above, we can summary the dataframe.
The dataset represents information about mobile apps available on the Google Play Store. It contains various attributes for each app, such as its name, category, rating, number of reviews, size, installation count, type (free or paid), price, content rating, genre, last update date, current version, and Android version compatibility.
App (string): The name of the mobile app.
Category (string): The category to which the app belongs. Top 3 categories are:
Rating (double): The average user rating of the app. Top 3 ratings are:
Reviews (string): The number of user reviews for the app.
Size (string): The size of the app. Note that some entries indicate "Varies with device."
Installs (string): The number of app installations. Top 3 installation counts are:
Type (string): Whether the app is free or paid. Top 3 types are:
Price (string): The price of the app. Some entries are in the format "$X.XX."
Content Rating (string): The content rating for the app. Top 3 content ratings are:
Genres (string): The genre(s) of the app. Top 3 genres are:
Last Updated (string): The date when the app was last updated.
Current Ver (string): The current version of the app. Note that some entries indicate "Varies with device."
Android Ver (string): The minimum Android version required to run the app.
df = df.drop("Size", "Content Rating", "Last Updated", "Current Ver", "Android Ver")
df.sample(False, 0.1).show(5)
+--------------------+--------------+------+-------+----------+----+-----+------------+ | App| Category|Rating|Reviews| Installs|Type|Price| Genres| +--------------------+--------------+------+-------+----------+----+-----+------------+ |U Launcher Lite –...|ART_AND_DESIGN| 4.7| 87510|5,000,000+|Free| 0|Art & Design| |Smoke Effect Phot...|ART_AND_DESIGN| 3.8| 178| 50,000+|Free| 0|Art & Design| | Easy Origami Ideas|ART_AND_DESIGN| 4.2| 1015| 100,000+|Free| 0|Art & Design| |How to draw Ladyb...|ART_AND_DESIGN| 3.8| 564| 100,000+|Free| 0|Art & Design| |How To Color Disn...|ART_AND_DESIGN| 4.0| 591| 500,000+|Free| 0|Art & Design| +--------------------+--------------+------+-------+----------+----+-----+------------+ only showing top 5 rows
df.printSchema()
root |-- App: string (nullable = true) |-- Category: string (nullable = true) |-- Rating: double (nullable = true) |-- Reviews: string (nullable = true) |-- Installs: string (nullable = true) |-- Type: string (nullable = true) |-- Price: string (nullable = true) |-- Genres: string (nullable = true)
We are required to convert the data types of the 'Reviews,' 'Installs,' and 'Price' columns from string to integer.
To enhance the data quality, we are required to convert the data types of the Reviews
, Installs
, and Price
columns from string to integer. Special attention is needed for the Installs
and Price
columns to handle additional characters present in the data.
# Convert "Reviews" column to integer
df = df.withColumn("Reviews", col("Reviews").cast(IntegerType()))
# Clean and convert "Installs" column to integer in a single step
df = df.withColumn("Installs", regexp_replace(col("Installs"), "[^0-9]", "").cast(IntegerType()))
# Clean and convert "Price" column to integer in a single step
df = df.withColumn("Price", regexp_replace(col("Price"), "[^0-9]", "").cast(IntegerType()))
# Group by 'App' and sum the 'Reviews' for each app
apps_review_sum = df.groupBy('App').agg(sum('Reviews').alias('Total Reviews'))
# Sort the DataFrame by 'Total Reviews' in descending order
top_5_apps_by_reviews = apps_review_sum.orderBy(col('Total Reviews').desc()).limit(5)
# Show the top 5 apps with the most sum of reviews
top_5_apps_by_reviews.show(truncate=False)
+----------------------------------------+-------------+ |App |Total Reviews| +----------------------------------------+-------------+ |Instagram |266241989 | |WhatsApp Messenger |207348304 | |Clash of Clans |179558781 | |Messenger – Text and Video Chat for Free|169932272 | |Subway Surfers |166331958 | +----------------------------------------+-------------+
# Group by 'App' and sum the 'Installs' for each app
apps_installs_sum = df.groupBy('App').agg(sum('Installs').alias('Total Installs'))
# Sort the DataFrame by 'Total Installs' in descending order
top_5_apps_by_installs = apps_installs_sum.orderBy(col('Total Installs').desc()).limit(5)
# Show the top 5 apps with the most sum of instaklls
top_5_apps_by_installs.show(truncate=False)
+--------------+--------------+ |App |Total Installs| +--------------+--------------+ |Subway Surfers|6000000000 | |Google Photos |4000000000 | |Hangouts |4000000000 | |Instagram |4000000000 | |Google Drive |4000000000 | +--------------+--------------+
# Group by 'Category' and sum the 'Installs' for each app
categories_installs_sum = df.groupBy('Category').agg(sum('Installs').alias('Total Installs'))
# Sort the DataFrame by 'Total Installs' in descending order
categories_installs_sum = categories_installs_sum.orderBy(col('Total Installs').desc())
# Show the category wise distribution of installed apps
categories_installs_sum.show(truncate=False)
+-------------------+--------------+ |Category |Total Installs| +-------------------+--------------+ |GAME |35086024415 | |COMMUNICATION |32647276251 | |PRODUCTIVITY |14176091369 | |SOCIAL |14069867902 | |TOOLS |11452771915 | |FAMILY |10258263505 | |PHOTOGRAPHY |10088247655 | |NEWS_AND_MAGAZINES |7496317760 | |TRAVEL_AND_LOCAL |6868887146 | |VIDEO_PLAYERS |6222002720 | |SHOPPING |3247848785 | |ENTERTAINMENT |2869160000 | |PERSONALIZATION |2325494782 | |BOOKS_AND_REFERENCE|1921469576 | |SPORTS |1751174498 | |HEALTH_AND_FITNESS |1583072512 | |BUSINESS |1001914865 | |FINANCE |876648734 | |EDUCATION |871452000 | |MAPS_AND_NAVIGATION|724281890 | +-------------------+--------------+ only showing top 20 rows
# Filtering the Rating Column From the invalid values
top_apps_by_ratings = df.filter(~isnan(col('Rating'))).filter(df['Rating'] <= 5)
# Sort the DataFrame by 'Rating' in descending order
top_apps_by_ratings = top_apps_by_ratings.orderBy(col('Rating').desc()).limit(5)
# Show Top 5 Apps with the Highest Ratings
top_apps_by_ratings.select('App', 'Rating', 'Type').show(truncate=False)
+------------------------------------------+------+----+ |App |Rating|Type| +------------------------------------------+------+----+ |Awake Dating |5.0 |Free| |Speeding Joyride & Car Meet App |5.0 |Free| |Spine- The dating app |5.0 |Free| |Hojiboy Tojiboyev Life Hacks |5.0 |Free| |Girls Live Talk - Free Text and Video Chat|5.0 |Free| +------------------------------------------+------+----+
# Sort the DataFrame by 'Price' in descending order
top_apps_by_price = df.orderBy(col('Price').desc()).limit(5)
# Show Top 5 Apps with the Highest Price
top_apps_by_price.select('App', 'Price').show(truncate=False)
+------------------------+-----+ |App |Price| +------------------------+-----+ |I'm Rich - Trump Edition|40000| |💎 I'm rich |39999| |I am Rich Plus |39999| |I am rich |39999| |I Am Rich Premium |39999| +------------------------+-----+
# Filtering the Rating Column From the invalid values and Free Apps
top_paid_apps_by_ratings = df.filter(~isnan(col('Rating'))).filter(df['Rating'] <= 5).filter(df['Type']=='Paid')
# Sort the DataFrame by 'Rating' in descending order
top_paid_apps_by_ratings = top_paid_apps_by_ratings.orderBy(col('Rating').desc()).limit(5)
# Show Top 5 Apps with the Highest Ratings
top_paid_apps_by_ratings.select('App', 'Rating', 'Type').show(truncate=False)
+-------------------------+------+----+ |App |Rating|Type| +-------------------------+------+----+ |Android P Style Icon Pack|5.0 |Paid| |AJ Gray Dark Icon Pack |5.0 |Paid| |Easy Hotspot Ad Free |5.0 |Paid| |P-Home for KLWP |5.0 |Paid| |Hey AJ! It's Saturday! |5.0 |Paid| +-------------------------+------+----+