{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-error-word-count",
   "metadata": {},
   "source": [
    "# Word frequencies in ERROR log messages\n",
    "\n",
    "Reads `logs.txt`, keeps the lines that start with `ERROR `, extracts the message part of each line, and counts how often each lower-cased word occurs.\n",
    "\n",
    "**Assumption (to confirm):** every ERROR line has at least three spaces, i.e. three leading fields followed by the message."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1cb824b4-398d-45b0-bc05-fc855b390f6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession, functions as F\n",
    "\n",
    "# getOrCreate() reuses an already-running session instead of starting a second one.\n",
    "spark = SparkSession.builder.appName(\"some_name\").getOrCreate()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-extract-messages",
   "metadata": {},
   "source": [
    "## Load the log and extract ERROR messages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "a8955f86-1640-455d-b7cf-efa7143829dc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------------------------------------+\n",
      "|message                                    |\n",
      "+-------------------------------------------+\n",
      "|Database connection failed                 |\n",
      "|Timeout while waiting for response response|\n",
      "|NullPointerException                       |\n",
      "+-------------------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# spark.read.text yields one row per input line in a single string column, \"value\".\n",
    "df = spark.read.text(\"logs.txt\")\n",
    "\n",
    "# Keep only the lines that begin with the literal prefix \"ERROR \".\n",
    "errors_df = df.filter(F.col(\"value\").startswith(\"ERROR \"))\n",
    "\n",
    "# Split on single spaces into at most four pieces: three leading fields and\n",
    "# then the untouched remainder of the line. Index 3 is that remainder.\n",
    "# NOTE(review): assumes every ERROR line has at least three spaces - confirm.\n",
    "parts = F.split(F.col(\"value\"), \" \", 4)\n",
    "messages_df = errors_df.select(parts[3].alias(\"message\"))\n",
    "\n",
    "messages_df.show(truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-tokenize",
   "metadata": {},
   "source": [
    "## Tokenize messages into lower-cased words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "ac16086b-6ac3-4976-97ce-d9cd3d90ff59",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|                word|\n",
      "+--------------------+\n",
      "|            database|\n",
      "|          connection|\n",
      "|              failed|\n",
      "|             timeout|\n",
      "|               while|\n",
      "|             waiting|\n",
      "|                 for|\n",
      "|            response|\n",
      "|            response|\n",
      "|nullpointerexception|\n",
      "+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Split on runs of whitespace (the pattern is a regex) so that repeated\n",
    "# spaces between words do not produce empty tokens.\n",
    "words = F.split(F.lower(F.col(\"message\")), r\"\\s+\")\n",
    "words_df = (\n",
    "    messages_df\n",
    "    .select(F.explode(words).alias(\"word\"))\n",
    "    # Leading or trailing whitespace still yields one empty token; drop it\n",
    "    # so that \"\" is never counted as a word.\n",
    "    .where(F.col(\"word\") != \"\")\n",
    ")\n",
    "words_df.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-count",
   "metadata": {},
   "source": [
    "## Count words, most frequent first\n",
    "\n",
    "Ties are broken alphabetically so the table is identical on every run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "058c2e5c-fae8-482c-b628-3922eaebd2f5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+-----+\n",
      "|                word|count|\n",
      "+--------------------+-----+\n",
      "|            response|    2|\n",
      "|          connection|    1|\n",
      "|            database|    1|\n",
      "|              failed|    1|\n",
      "|                 for|    1|\n",
      "|nullpointerexception|    1|\n",
      "|             timeout|    1|\n",
      "|             waiting|    1|\n",
      "|               while|    1|\n",
      "+--------------------+-----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "result = (\n",
    "    words_df\n",
    "    .groupBy(\"word\")\n",
    "    .count()\n",
    "    # Sorting by count alone leaves equal-count rows in arbitrary order;\n",
    "    # the secondary key makes the displayed order reproducible.\n",
    "    .orderBy(F.desc(\"count\"), F.asc(\"word\"))\n",
    ")\n",
    "result.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}