FIFA Rankings Analysis with PySpark
In this PySpark-based analysis, we explore historical FIFA rankings. The dataset includes monthly ranking data for countries like Argentina (ARG), Brazil (BRA), Espanol (ESP), France(FRA), Germany (GER) and Italy (ITA).
Our analysis focuses on two main insights:
Top N Countries for a Selected Month and Year:
Users can input a specific year and month to see which countries had the best FIFA rankings during that period. User can also provide how many top countries is he interested in seeing.
Countries That Reached Rank #1:
We identify all instances where a country achieved the #1 FIFA rank, along with the corresponding month and year.
# import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, stack, desc, count, min, max, avg, year, month
from pyspark.sql.types import IntegerType
# Start Spark session
spark = SparkSession.builder.appName("FIFA Ranking Analysis").getOrCreate()
# Read the CSV file
df = spark.read.option("header", True).option("inferSchema", True).csv("C:\\Sarika\\Assignments\\fifa.csv")
# Convert the columns ARG, BRA, ESP, FRA, GER, and ITA in the DataFrame df to integers
df = df.withColumn("ARG", col("ARG").cast(IntegerType()))\
.withColumn("BRA", col("BRA").cast(IntegerType()))\
.withColumn("ESP", col("ESP").cast(IntegerType()))\
.withColumn("FRA", col("FRA").cast(IntegerType()))\
.withColumn("GER", col("GER").cast(IntegerType()))\
.withColumn("ITA", col("ITA").cast(IntegerType()))
# List of countries
countries = ['ARG', 'BRA', 'ESP', 'FRA', 'GER', 'ITA']
# Create stack expression to convert multiple country columns into two columns: Country and Rank.
stack_expr = "stack(, {1}) as (Country, Rank)".format(
len(countries),
', '.join([f"'{c}', {c}" for c in countries])
)
# Reshape to long format
rankings_reshaped = df.selectExpr("Date", stack_expr)
# Top N Countries for a given Month and Year
# Prompt user to enter values for year and month
input_year = int(input("Enter year (e.g., 2024): "))
input_month = int(input("Enter month (1–12): "))
top_n = int(input("Enter number of top countries to return: "))
# Print input values for clarity in blog
print(f"Selected Year: {input_year}")
print(f"Selected Month: {input_month}")
print(f"Top {top_n} Countries by FIFA Ranking:")
top_countries = (
rankings_reshaped.filter((year(col("date")) == input_year) & (month(col("date")) == input_month))
.groupBy("country")\
.agg(min("rank").alias("rank"))\
.orderBy("rank") \
.limit(top_n)
)
top_countries.show()
User provides 2012 as yr, 9 as month and 5 for top N countries.
# COUNTRIES THAT REACHED RANK #1 ALONG WITH MONTH AND YEAR
country_list = [row['country'] for row in rankings_reshaped.select("country").distinct().collect()]
print("\nSelect a country from the list below")
print("Available countries:\n")
for country in country_list:
print(f"- {country}")
selected_country = input("\nEnter the country name from the above list: ")
print(f'User selected "{selected_country}"')
rank1_count = rankings_reshaped.filter((col("country") == selected_country) & (col("rank") == 1)).count()
print(f'{selected_country} has been at no 1 for {rank1_count} times')
rank1 = (
rankings_reshaped.filter((col("country") == selected_country) & (col("rank") == 1))
.withColumn("month", month(col("date")))
.withColumn("year", year(col("date")))
.select("country", "month", "year")
)
rank1.show()
# NUMBER OF TIMES EACH COUNTRY RANKED #1