feat: add homework

This commit is contained in:
nazarenko.ae
2026-06-07 13:48:55 +07:00
commit cd69553c99
5 changed files with 608 additions and 0 deletions

83
homework/task1.ipynb Normal file
View File

@@ -0,0 +1,83 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "f62f45bf-8549-4d60-b90f-270ed1c142b5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting default log level to \"WARN\".\n",
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
"26/06/06 09:07:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Активные Spark сессии: http://5e7ed2c4667c:4040\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"26/06/06 09:07:29 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors\n"
]
}
],
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"spark = SparkSession.builder.appName(\"test_pyspark_local\").getOrCreate()\n",
"print(\"Активные Spark сессии:\", spark.sparkContext.uiWebUrl)\n",
"\n",
"# 1\n",
"logs_rdd = spark.sparkContext.textFile(\"logs.txt\")\n",
"# 2 - transformation\n",
"errors_rdd = logs_rdd.filter(lambda line: \"ERROR\" in line)\n",
"# 3 - transformation\n",
"erros_upper_rdd = errors_rdd.map(lambda line: line.upper())\n",
"# 4 - transformation\n",
"error_count = erros_upper_rdd.count()\n",
"# 5 - action\n",
"erros_upper_rdd.saveAsTextFile(\"errors_upper.txt\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d95abd88-700e-4615-8a7a-98a4f9b4877d",
"metadata": {},
"outputs": [],
"source": [
"df = spark.read.text(\"logs.txt\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

151
homework/task2.ipynb Normal file
View File

@@ -0,0 +1,151 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 26,
"id": "ed083f23-2556-4577-b770-36a087bf4566",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import SparkSession\n",
"import pyspark.sql.functions as F\n",
"\n",
"spark = SparkSession.builder.appName(\"some_name\").getOrCreate()\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "a9ae7a2b-7a96-4816-8b2d-8bf6b72aaffd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---------+----------+--------+----------------------------------+\n",
"|log_level|date |time |message |\n",
"+---------+----------+--------+----------------------------------+\n",
"|INFO |2025-07-30|12:00:00|User logged in |\n",
"|DEBUG |2025-07-30|12:01:00|Session started |\n",
"|ERROR |2025-07-30|12:02:00|Database connection failed |\n",
"|INFO |2025-07-30|12:03:00|User clicked button |\n",
"|ERROR |2025-07-30|12:04:00|Timeout while waiting for response|\n",
"|DEBUG |2025-07-30|12:05:00|Memory usage: 75% |\n",
"|ERROR |2025-07-30|12:06:00|NullPointerException |\n",
"+---------+----------+--------+----------------------------------+\n",
"\n"
]
}
],
"source": [
"parts = F.split(F.col(\"value\"), \" \", 4)\n",
"df_logs = spark.read.text(\"logs.txt\").select(\n",
" parts[0].alias(\"log_level\"),\n",
" parts[1].alias(\"date\"),\n",
" parts[2].alias(\"time\"),\n",
" parts[3].alias(\"message\")\n",
")\n",
"\n",
"df_logs.show(truncate=False)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "31c2f26f-9a47-4a60-9d46-de6c5742b524",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---------+----------+--------+----------------------------------+\n",
"|log_level|date |time |message |\n",
"+---------+----------+--------+----------------------------------+\n",
"|ERROR |2025-07-30|12:02:00|Database connection failed |\n",
"|ERROR |2025-07-30|12:04:00|Timeout while waiting for response|\n",
"|ERROR |2025-07-30|12:06:00|NullPointerException |\n",
"+---------+----------+--------+----------------------------------+\n",
"\n"
]
}
],
"source": [
"df_logs_error = df_logs.filter(F.col(\"log_level\") == \"ERROR\")\n",
"df_logs_error.show(truncate=False)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "e3d10e1d-3c62-4c98-9404-81c9aa54d5f0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---------+----------+--------+----------------------------------+----+\n",
"|log_level|date |time |message |hour|\n",
"+---------+----------+--------+----------------------------------+----+\n",
"|ERROR |2025-07-30|12:02:00|Database connection failed |12 |\n",
"|ERROR |2025-07-30|12:04:00|Timeout while waiting for response|12 |\n",
"|ERROR |2025-07-30|12:06:00|NullPointerException |12 |\n",
"+---------+----------+--------+----------------------------------+----+\n",
"\n"
]
}
],
"source": [
"df_logs_error = df_logs_error.withColumn(\"hour\", F.substring(\"time\", 1, 2).cast(\"int\"))\n",
"df_logs_error.show(truncate=False)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "fd6d5a08-e464-4d62-bd93-7f118db53f6e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----+-----+\n",
"|hour|count|\n",
"+----+-----+\n",
"| 12| 3|\n",
"+----+-----+\n",
"\n"
]
}
],
"source": [
"df_count_errors_by_hour = df_logs_error.groupBy(\"hour\").count()\n",
"df_count_errors_by_hour.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

113
homework/task3.ipynb Normal file
View File

@@ -0,0 +1,113 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"id": "e3cf6fb7-e4e7-4cd7-baa7-03209328631c",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import SparkSession\n",
"import pyspark.sql.functions as F\n",
"\n",
"spark = SparkSession.builder.appName(\"some_name\").getOrCreate()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0f1fb08-2373-48b2-99d9-1424a4ceb0c0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---------+\n",
"|log_level|\n",
"+---------+\n",
"|INFO |\n",
"|DEBUG |\n",
"|ERROR |\n",
"|INFO |\n",
"|ERROR |\n",
"|DEBUG |\n",
"|ERROR |\n",
"+---------+\n",
"\n"
]
}
],
"source": [
"parts = F.split(F.col(\"value\"), \" \", 2)\n",
"df_logs = spark.read.text(\"logs.txt\").select(parts[0].alias(\"log_level\"))\n",
"df_logs.show(truncate=False)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "6d784b52-9369-4051-8084-a7f705c001fa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_error_logs = df_logs.filter(F.col(\"log_level\") == \"ERROR\").count()\n",
"count_error"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "592dc010-5700-4cc1-b0bf-8738c3b1567e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.42857142857142855"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"total = df_logs.count()\n",
"count_error / total"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

140
homework/task4.ipynb Normal file
View File

@@ -0,0 +1,140 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "1cb824b4-398d-45b0-bc05-fc855b390f6c",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import SparkSession, functions as F\n",
"\n",
"spark = SparkSession.builder.appName(\"some_name\").getOrCreate()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "a8955f86-1640-455d-b7cf-efa7143829dc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------------------------------------+\n",
"|message |\n",
"+-------------------------------------------+\n",
"|Database connection failed |\n",
"|Timeout while waiting for response response|\n",
"|NullPointerException |\n",
"+-------------------------------------------+\n",
"\n"
]
}
],
"source": [
"df = spark.read.text(\"logs.txt\")\n",
"\n",
"errors_df = df.filter(F.col(\"value\").startswith(\"ERROR \"))\n",
"\n",
"parts = F.split(F.col(\"value\"), \" \", 4)\n",
"messages_df = errors_df.select(parts[3].alias(\"message\"))\n",
"\n",
"messages_df.show(truncate=False)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "ac16086b-6ac3-4976-97ce-d9cd3d90ff59",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+\n",
"| word|\n",
"+--------------------+\n",
"| database|\n",
"| connection|\n",
"| failed|\n",
"| timeout|\n",
"| while|\n",
"| waiting|\n",
"| for|\n",
"| response|\n",
"| response|\n",
"|nullpointerexception|\n",
"+--------------------+\n",
"\n"
]
}
],
"source": [
"words = F.split(F.lower(F.col(\"message\")), \" \")\n",
"words_df = messages_df.select(F.explode(words).alias(\"word\"))\n",
"words_df.show()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "058c2e5c-fae8-482c-b628-3922eaebd2f5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-----+\n",
"| word|count|\n",
"+--------------------+-----+\n",
"| response| 2|\n",
"|nullpointerexception| 1|\n",
"| for| 1|\n",
"| failed| 1|\n",
"| timeout| 1|\n",
"| database| 1|\n",
"| while| 1|\n",
"| waiting| 1|\n",
"| connection| 1|\n",
"+--------------------+-----+\n",
"\n"
]
}
],
"source": [
"result = (\n",
" words_df\n",
" .groupBy(\"word\")\n",
" .count()\n",
" .orderBy(F.desc(\"count\"))\n",
")\n",
"result.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

121
homework/task5.ipynb Normal file
View File

@@ -0,0 +1,121 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "53b1885e-2dff-41ec-9a50-3f1bba298349",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import SparkSession, functions as F\n",
"\n",
"spark = SparkSession.builder.appName(\"some_name\").getOrCreate()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "61b1708d-9227-4200-afce-be808286e6b4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------------------------------------+\n",
"|message |\n",
"+------------------------------------------------+\n",
"|Database connection failed |\n",
"|Timeout while waiting for response response user|\n",
"|NullPointerException |\n",
"+------------------------------------------------+\n",
"\n"
]
}
],
"source": [
"df = spark.read.text(\"logs.txt\")\n",
"\n",
"errors_df = df.filter(F.col(\"value\").startswith(\"ERROR \"))\n",
"\n",
"parts = F.split(F.col(\"value\"), \" \", 4)\n",
"messages_df = errors_df.select(parts[3].alias(\"message\"))\n",
"\n",
"messages_df.show(truncate=False)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "0ee49042-a0d8-4f2a-b9bf-9818dad3c61f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------------------------------------+\n",
"|message |\n",
"+------------------------------------------------+\n",
"|database connection failed |\n",
"|timeout while waiting for response response user|\n",
"|nullpointerexception |\n",
"+------------------------------------------------+\n",
"\n"
]
}
],
"source": [
"lower_messages_df = messages_df.select(F.lower(F.col(\"message\")).alias(\"message\"))\n",
"lower_messages_df.show(truncate=False)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "ed957bf8-6ba2-452c-85d7-28162452ed2b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------------------------------------+---------------+\n",
"|message |user_in_message|\n",
"+------------------------------------------------+---------------+\n",
"|database connection failed |false |\n",
"|timeout while waiting for response response user|true |\n",
"|nullpointerexception |false |\n",
"+------------------------------------------------+---------------+\n",
"\n"
]
}
],
"source": [
"lower_messages_df = lower_messages_df.withColumn(\"user_in_message\", F.col(\"message\").contains(\"user\"))\n",
"lower_messages_df.show(truncate=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}