{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "md-overview",
   "metadata": {},
   "source": [
    "# Extracting ERROR messages from a text log\n",
    "\n",
    "This notebook reads a plain-text log with Spark, keeps the lines that start with `ERROR `, extracts the message portion of each line, lower-cases it, and flags the messages that contain the substring `user`."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-setup",
   "metadata": {},
   "source": [
    "## Imports and configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "53b1885e-2dff-41ec-9a50-3f1bba298349",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession, functions as F\n",
    "\n",
    "# Relative path of the raw log file.\n",
    "LOG_PATH = \"logs.txt\"\n",
    "# Cap on the number of fields produced when a log line is split on spaces;\n",
    "# with a positive limit the last field keeps the remainder of the line.\n",
    "LOG_FIELD_LIMIT = 4\n",
    "# Zero-based index of the message within those fields.\n",
    "MESSAGE_FIELD_INDEX = 3\n",
    "\n",
    "spark = SparkSession.builder.appName(\"some_name\").getOrCreate()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load",
   "metadata": {},
   "source": [
    "## Load the log and extract ERROR messages\n",
    "\n",
    "Each line is split on single spaces into at most four fields, and the fourth field (everything after the third space) is taken as the message.\n",
    "\n",
    "**NOTE(review):** what the two fields between `ERROR` and the message contain is not visible from this notebook; confirm against the actual log format."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "61b1708d-9227-4200-afce-be808286e6b4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------------------------------------+\n",
      "|message |\n",
      "+------------------------------------------------+\n",
      "|Database connection failed |\n",
      "|Timeout while waiting for response response user|\n",
      "|NullPointerException |\n",
      "+------------------------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# spark.read.text yields one row per line in a single string column named value.\n",
    "df = spark.read.text(LOG_PATH)\n",
    "\n",
    "# Keep only the lines whose text begins with the literal ERROR prefix.\n",
    "errors_df = df.filter(F.col(\"value\").startswith(\"ERROR \"))\n",
    "\n",
    "# The capped split leaves spaces inside the message untouched.\n",
    "parts = F.split(F.col(\"value\"), \" \", LOG_FIELD_LIMIT)\n",
    "messages_df = errors_df.select(parts[MESSAGE_FIELD_INDEX].alias(\"message\"))\n",
    "\n",
    "messages_df.show(truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-normalise",
   "metadata": {},
   "source": [
    "## Lower-case the messages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "0ee49042-a0d8-4f2a-b9bf-9818dad3c61f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------------------------------------+\n",
      "|message |\n",
      "+------------------------------------------------+\n",
      "|database connection failed |\n",
      "|timeout while waiting for response response user|\n",
      "|nullpointerexception |\n",
      "+------------------------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Lower-casing first makes the substring check in the next section case-insensitive.\n",
    "lower_messages_df = messages_df.select(F.lower(F.col(\"message\")).alias(\"message\"))\n",
    "lower_messages_df.show(truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-flag",
   "metadata": {},
   "source": [
    "## Flag messages that mention `user`\n",
    "\n",
    "`contains` is a plain substring test, so a word such as `username` would be flagged as well."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "ed957bf8-6ba2-452c-85d7-28162452ed2b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------------------------------------+---------------+\n",
      "|message |user_in_message|\n",
      "+------------------------------------------------+---------------+\n",
      "|database connection failed |false |\n",
      "|timeout while waiting for response response user|true |\n",
      "|nullpointerexception |false |\n",
      "+------------------------------------------------+---------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Build a new frame instead of rebinding lower_messages_df, so the variable shown by\n",
    "# the previous cell keeps its schema no matter how often this cell is run.\n",
    "flagged_messages_df = lower_messages_df.withColumn(\n",
    "    \"user_in_message\", F.col(\"message\").contains(\"user\")\n",
    ")\n",
    "flagged_messages_df.show(truncate=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}