commit cd69553c993f14afd0641e58630a4b5070bd5df6 Author: nazarenko.ae Date: Sun Jun 7 13:48:55 2026 +0700 feat: add homework diff --git a/homework/task1.ipynb b/homework/task1.ipynb new file mode 100644 index 0000000..e52f512 --- /dev/null +++ b/homework/task1.ipynb @@ -0,0 +1,83 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "f62f45bf-8549-4d60-b90f-270ed1c142b5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "26/06/06 09:07:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Активные Spark сессии: http://5e7ed2c4667c:4040\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "26/06/06 09:07:29 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors\n" + ] + } + ], + "source": [ + "from pyspark.sql import SparkSession\n", + "\n", + "spark = SparkSession.builder.appName(\"test_pyspark_local\").getOrCreate()\n", + "print(\"Активные Spark сессии:\", spark.sparkContext.uiWebUrl)\n", + "\n", + "# 1\n", + "logs_rdd = spark.sparkContext.textFile(\"logs.txt\")\n", + "# 2 - transformation\n", + "errors_rdd = logs_rdd.filter(lambda line: \"ERROR\" in line)\n", + "# 3 - transformation\n", + "erros_upper_rdd = errors_rdd.map(lambda line: line.upper())\n", + "# 4 - transformation\n", + "error_count = erros_upper_rdd.count()\n", + "# 5 - action\n", + "erros_upper_rdd.saveAsTextFile(\"errors_upper.txt\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d95abd88-700e-4615-8a7a-98a4f9b4877d", + "metadata": {}, + "outputs": [], + "source": [ + "df = spark.read.text(\"logs.txt\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/homework/task2.ipynb b/homework/task2.ipynb new file mode 100644 index 0000000..6bc8b1c --- /dev/null +++ b/homework/task2.ipynb @@ -0,0 +1,151 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 26, + "id": "ed083f23-2556-4577-b770-36a087bf4566", + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "import pyspark.sql.functions as F\n", + "\n", + "spark = SparkSession.builder.appName(\"some_name\").getOrCreate()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "a9ae7a2b-7a96-4816-8b2d-8bf6b72aaffd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------+----------+--------+----------------------------------+\n", + "|log_level|date |time |message |\n", + "+---------+----------+--------+----------------------------------+\n", + "|INFO |2025-07-30|12:00:00|User logged in |\n", + "|DEBUG |2025-07-30|12:01:00|Session started |\n", + "|ERROR |2025-07-30|12:02:00|Database connection failed |\n", + "|INFO |2025-07-30|12:03:00|User clicked button |\n", + "|ERROR |2025-07-30|12:04:00|Timeout while waiting for response|\n", + "|DEBUG |2025-07-30|12:05:00|Memory usage: 75% |\n", + "|ERROR |2025-07-30|12:06:00|NullPointerException |\n", + "+---------+----------+--------+----------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "parts = F.split(F.col(\"value\"), \" \", 4)\n", + "df_logs = spark.read.text(\"logs.txt\").select(\n", + " parts[0].alias(\"log_level\"),\n", + " parts[1].alias(\"date\"),\n", + " parts[2].alias(\"time\"),\n", + " parts[3].alias(\"message\")\n", + ")\n", + "\n", + "df_logs.show(truncate=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "31c2f26f-9a47-4a60-9d46-de6c5742b524", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------+----------+--------+----------------------------------+\n", + "|log_level|date |time |message |\n", + "+---------+----------+--------+----------------------------------+\n", + "|ERROR |2025-07-30|12:02:00|Database connection failed |\n", + "|ERROR |2025-07-30|12:04:00|Timeout while waiting for response|\n", + "|ERROR |2025-07-30|12:06:00|NullPointerException |\n", + "+---------+----------+--------+----------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "df_logs_error = df_logs.filter(F.col(\"log_level\") == \"ERROR\")\n", + "df_logs_error.show(truncate=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "e3d10e1d-3c62-4c98-9404-81c9aa54d5f0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------+----------+--------+----------------------------------+----+\n", + "|log_level|date |time |message |hour|\n", + "+---------+----------+--------+----------------------------------+----+\n", + "|ERROR |2025-07-30|12:02:00|Database connection failed |12 |\n", + "|ERROR |2025-07-30|12:04:00|Timeout while waiting for response|12 |\n", + "|ERROR |2025-07-30|12:06:00|NullPointerException |12 |\n", + "+---------+----------+--------+----------------------------------+----+\n", + "\n" + ] + } + ], + "source": [ + "df_logs_error = df_logs_error.withColumn(\"hour\", F.substring(\"time\", 1, 2).cast(\"int\"))\n", + "df_logs_error.show(truncate=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "fd6d5a08-e464-4d62-bd93-7f118db53f6e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----+-----+\n", + "|hour|count|\n", + "+----+-----+\n", + "| 12| 3|\n", + "+----+-----+\n", + "\n" + ] + } + ], + "source": [ + "df_count_errors_by_hour = df_logs_error.groupBy(\"hour\").count()\n", + "df_count_errors_by_hour.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/homework/task3.ipynb b/homework/task3.ipynb new file mode 100644 index 0000000..c0699e9 --- /dev/null +++ b/homework/task3.ipynb @@ -0,0 +1,113 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "id": "e3cf6fb7-e4e7-4cd7-baa7-03209328631c", + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "import pyspark.sql.functions as F\n", + "\n", + "spark = SparkSession.builder.appName(\"some_name\").getOrCreate()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0f1fb08-2373-48b2-99d9-1424a4ceb0c0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------+\n", + "|log_level|\n", + "+---------+\n", + "|INFO |\n", + "|DEBUG |\n", + "|ERROR |\n", + "|INFO |\n", + "|ERROR |\n", + "|DEBUG |\n", + "|ERROR |\n", + "+---------+\n", + "\n" + ] + } + ], + "source": [ + "parts = F.split(F.col(\"value\"), \" \", 2)\n", + "df_logs = spark.read.text(\"logs.txt\").select(parts[0].alias(\"log_level\"))\n", + "df_logs.show(truncate=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "6d784b52-9369-4051-8084-a7f705c001fa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_error_logs = df_logs.filter(F.col(\"log_level\") == \"ERROR\").count()\n", + "count_error" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "592dc010-5700-4cc1-b0bf-8738c3b1567e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.42857142857142855" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "total = df_logs.count()\n", + "count_error / total" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/homework/task4.ipynb b/homework/task4.ipynb new file mode 100644 index 0000000..c9bbbae --- /dev/null +++ b/homework/task4.ipynb @@ -0,0 +1,140 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": "1cb824b4-398d-45b0-bc05-fc855b390f6c", + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession, functions as F\n", + "\n", + "spark = SparkSession.builder.appName(\"some_name\").getOrCreate()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "a8955f86-1640-455d-b7cf-efa7143829dc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------------------------------------+\n", + "|message |\n", + "+-------------------------------------------+\n", + "|Database connection failed |\n", + "|Timeout while waiting for response response|\n", + "|NullPointerException |\n", + "+-------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "df = spark.read.text(\"logs.txt\")\n", + "\n", + "errors_df = df.filter(F.col(\"value\").startswith(\"ERROR \"))\n", + "\n", + "parts = F.split(F.col(\"value\"), \" \", 4)\n", + "messages_df = errors_df.select(parts[3].alias(\"message\"))\n", + "\n", + "messages_df.show(truncate=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "ac16086b-6ac3-4976-97ce-d9cd3d90ff59", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| word|\n", + "+--------------------+\n", + "| database|\n", + "| connection|\n", + "| failed|\n", + "| timeout|\n", + "| while|\n", + "| waiting|\n", + "| for|\n", + "| response|\n", + "| response|\n", + "|nullpointerexception|\n", + "+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "words = F.split(F.lower(F.col(\"message\")), \" \")\n", + "words_df = messages_df.select(F.explode(words).alias(\"word\"))\n", + "words_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "058c2e5c-fae8-482c-b628-3922eaebd2f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----+\n", + "| word|count|\n", + "+--------------------+-----+\n", + "| response| 2|\n", + "|nullpointerexception| 1|\n", + "| for| 1|\n", + "| failed| 1|\n", + "| timeout| 1|\n", + "| database| 1|\n", + "| while| 1|\n", + "| waiting| 1|\n", + "| connection| 1|\n", + "+--------------------+-----+\n", + "\n" + ] + } + ], + "source": [ + "result = (\n", + " words_df\n", + " .groupBy(\"word\")\n", + " .count()\n", + " .orderBy(F.desc(\"count\"))\n", + ")\n", + "result.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/homework/task5.ipynb b/homework/task5.ipynb new file mode 100644 index 0000000..021f1ec --- /dev/null +++ b/homework/task5.ipynb @@ -0,0 +1,121 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "53b1885e-2dff-41ec-9a50-3f1bba298349", + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession, functions as F\n", + "\n", + "spark = SparkSession.builder.appName(\"some_name\").getOrCreate()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "61b1708d-9227-4200-afce-be808286e6b4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------------------------------------------+\n", + "|message |\n", + "+------------------------------------------------+\n", + "|Database connection failed |\n", + "|Timeout while waiting for response response user|\n", + "|NullPointerException |\n", + "+------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "df = spark.read.text(\"logs.txt\")\n", + "\n", + "errors_df = df.filter(F.col(\"value\").startswith(\"ERROR \"))\n", + "\n", + "parts = F.split(F.col(\"value\"), \" \", 4)\n", + "messages_df = errors_df.select(parts[3].alias(\"message\"))\n", + "\n", + "messages_df.show(truncate=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "0ee49042-a0d8-4f2a-b9bf-9818dad3c61f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------------------------------------------+\n", + "|message |\n", + "+------------------------------------------------+\n", + "|database connection failed |\n", + "|timeout while waiting for response response user|\n", + "|nullpointerexception |\n", + "+------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "lower_messages_df = messages_df.select(F.lower(F.col(\"message\")).alias(\"message\"))\n", + "lower_messages_df.show(truncate=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ed957bf8-6ba2-452c-85d7-28162452ed2b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------------------------------------------+---------------+\n", + "|message |user_in_message|\n", + "+------------------------------------------------+---------------+\n", + "|database connection failed |false |\n", + "|timeout while waiting for response response user|true |\n", + "|nullpointerexception |false |\n", + "+------------------------------------------------+---------------+\n", + "\n" + ] + } + ], + "source": [ + "lower_messages_df = lower_messages_df.withColumn(\"user_in_message\", F.col(\"message\").contains(\"user\"))\n", + "lower_messages_df.show(truncate=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}