手记

Spark 可视化实战:分析日志中的搜索引擎爬虫来源

1.日志部分内容:

66.249.79.35 - - [14/Jun/2018:06:45:24 +0000] "GET /img/20180504/702434-20180302101540805-554506523.jpg HTTP/1.1" 200 10013 "-" "Googlebot-Image/1.0"
66.249.79.35 - - [14/Jun/2018:06:45:25 +0000] "GET /img/20180504/702434-20180302161346635-1714710787.jpg HTTP/1.1" 200 45157 "-" "Googlebot-Image/1.0"
66.249.79.35 - - [14/Jun/2018:06:45:32 +0000] "GET /img/2018/05/21/89993124.jpg HTTP/1.1" 200 42160 "-" "Googlebot-Image/1.0"
66.249.79.35 - - [14/Jun/2018:06:45:32 +0000] "GET /archives/148618 HTTP/1.1" 200 8932 "-" "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
54.36.148.126 - - [14/Jun/2018:06:45:33 +0000] "GET /archives/91429 HTTP/1.1" 200 8223 "-" "Mozilla/5.0 (compatible; AhrefsBot/5.2; +http://ahrefs.com/robot/)"
54.36.149.31 - - [14/Jun/2018:06:45:34 +0000] "GET /?s=Community HTTP/1.1" 200 6741 "-" "Mozilla/5.0 (compatible; AhrefsBot/5.2; +http://ahrefs.com/robot/)"
66.249.79.35 - - [14/Jun/2018:06:45:40 +0000] "GET /img/20180505/1018770-20180131142516171-907427428.jpg HTTP/1.1" 200 8652 "-" "Googlebot-Image/1.0"
5.255.250.200 - - [14/Jun/2018:06:45:46 +0000] "GET /robots.txt HTTP/1.1" 200 445 "-" "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"
66.249.79.35 - - [14/Jun/2018:06:45:46 +0000] "GET /archives/148211 HTTP/1.1" 200 8514 "-" "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
66.249.79.35 - - [14/Jun/2018:06:45:48 +0000] "GET /img/2018/05/20/1339446-20180517152850212-272519877.jpg HTTP/1.1" 200 124550 "-" "Googlebot-Image/1.0"
220.181.108.147 - - [14/Jun/2018:06:45:52 +0000] "GET /img/20180407/592104-20180302134147548-901544498.jpg HTTP/1.1" 404 22994 "-" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36"
66.249.79.35 - - [14/Jun/2018:06:45:56 +0000] "GET /img/2018/05/21/60662344.jpg HTTP/1.1" 200 14133 "-" "Googlebot-Image/1.0"
66.249.79.35 - - [14/Jun/2018:06:46:00 +0000] "GET /archives/119633 HTTP/1.1" 200 9306 "-" "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
54.36.148.129 - - [14/Jun/2018:06:46:01 +0000] "GET /archives/91007 HTTP/1.1" 200 8332 "-" "Mozilla/5.0 (compatible; AhrefsBot/5.2; +http://ahrefs.com/robot/)"
54.36.148.201 - - [14/Jun/2018:06:46:03 +0000] "GET /archives/88741/feed HTTP/1.1" 200 983 "-" "Mozilla/5.0 (compatible; AhrefsBot/5.2; +http://ahrefs.com/robot/)"
5.255.250.200 - - [14/Jun/2018:06:46:03 +0000] "GET /archives/87084 HTTP/1.1" 200 9951 "-" "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"

2. 使用 Spark 对日志进行清洗和统计,并把结果插入到 MySQL

package com.codeblogbt

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

import scala.collection.mutable.ListBuffer

/**
  * Batch job that counts, per crawler, how many article ("archives") pages were
  * requested in an Apache access log, and hands the per-crawler totals to
  * `MysqlAction.insertTopRobot`.
  *
  * Pipeline:
  *  1. read the raw log and keep lines mentioning "archives";
  *  2. parse each line into a `VisitInfo`, dropping lines that are malformed or
  *     whose URL does not end in a non-zero numeric article id;
  *  3. group by robot id, count, and join with a small id -> name lookup table;
  *  4. write each partition of the joined result through `MysqlAction`.
  */
object SparkStatFormatJob {

  import scala.util.Try

  /** Minimum number of space-separated fields needed to read up to the byte-count field. */
  private val MinFieldCount = 10

  /**
    * Extracts the article id from a request path such as `/archives/148618`.
    *
    * @param url request path taken from the log line
    * @return the integer after the last `/`, or `None` when the path does not
    *         contain "archives", the last segment is not an integer
    *         (e.g. `/archives/88741/feed`), or the id is 0
    */
  private def parseArchivesId(url: String): Option[Int] =
    if (url.contains("archives")) {
      Try(url.substring(url.lastIndexOf('/') + 1).toInt).toOption.filter(_ != 0)
    } else {
      None
    }

  /**
    * Parses the response-size field.
    *
    * @param field raw field text
    * @return the size as a `Long`; a lone "-" is treated as 0 bytes; any other
    *         non-numeric text yields `None`
    */
  private def parseTraffic(field: String): Option[Long] =
    if (field == "-") Some(0L) else Try(field.toLong).toOption

  /**
    * Maps a log line to a robot id by substring match. The checks run in this
    * fixed order and the first match wins.
    *
    * @param line full log line
    * @return 3 Googlebot, 4 Baiduspider, 1 Yandex, 2 ahrefs, 5 ia_archiver,
    *         0 when none of these markers is present
    */
  private def robotIdOf(line: String): Int =
    if (line.contains("Googlebot")) 3
    else if (line.contains("Baiduspider")) 4
    else if (line.contains("Yandex")) 1
    else if (line.contains("ahrefs")) 2
    else if (line.contains("ia_archiver")) 5
    else 0

  /**
    * Parses one access-log line into a `VisitInfo`.
    *
    * Fields are taken by position after splitting on a single space:
    * 0 = client ip, 3 and 4 = timestamp (e.g. `[14/Jun/2018:06:45:24 +0000]`),
    * 6 = request path, 8 = status code, 9 = response size.
    *
    * @param line raw log line
    * @return `Some(VisitInfo)` for a well-formed article request, `None` when the
    *         line has too few fields, has no usable article id, or has a
    *         non-numeric status code / size
    */
  private def parseVisit(line: String): Option[VisitInfo] = {
    val splits = line.split(" ")
    if (splits.length < MinFieldCount) {
      None
    } else {
      for {
        archivesId <- parseArchivesId(splits(6))
        statusCode <- Try(splits(8).toInt).toOption
        traffic    <- parseTraffic(splits(9))
      } yield {
        val ip = splits(0)
        val time = splits(3) + " " + splits(4)
        VisitInfo(ip, DateUtils.parse(time), archivesId, statusCode, traffic, robotIdOf(line))
      }
    }
  }

  /**
    * Job entry point.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().appName("SparkStatFormatJob")
      .master("local[2]").getOrCreate()

    // Stop the session even when the job fails part-way through.
    try {
      // Alternative HDFS input:
      //   spark.sparkContext.textFile("hdfs://localhost:9000/user/walle/access.log.3")
      val access = spark.sparkContext.textFile("file:///Users/walle/Documents/D2/log/apache2/access.log.3")

      // Cheap pre-filter; parseVisit re-checks the request path itself.
      val filterRobot = access.filter(line => line.contains("archives"))

      import spark.implicits._
      val visitDF = filterRobot.flatMap(line => parseVisit(line).toList).toDF()

      val resultDF = visitDF.groupBy("robotId").agg(count("archivesId").as("id_count"))

      // Lookup table: robot id -> display name (0 = not a recognised crawler).
      val robotDF = Seq((0, "people"), (1, "Yandex"), (2, "Ahrefs"), (3, "Google"), (4, "Baidu"), (5, "ia_archiver"))
        .toDF("id", "robot_name")

      val joinDF = resultDF.join(robotDF, resultDF.col("robotId") === robotDF.col("id"))

      // The parameter type is spelled out so the call resolves to the Scala
      // (Iterator => Unit) overload rather than the Java functional-interface one.
      // NOTE(review): insertTopRobot is also invoked for empty partitions, as before;
      // confirm whether it should be skipped when `list` is empty.
      joinDF.foreachPartition((partitionOfRecords: Iterator[org.apache.spark.sql.Row]) => {
        val list = new ListBuffer[RobotVisitInfo]
        partitionOfRecords.foreach(info => {
          val id = info.getAs[Int]("id")
          val robotName = info.getAs[String]("robot_name")
          val idCount = info.getAs[Long]("id_count")
          list.append(RobotVisitInfo(id, robotName, idCount))
        })
        MysqlAction.insertTopRobot(list)
      })
    } finally {
      spark.stop()
    }
  }

}

3. 用 ECharts 对 MySQL 中的统计结果进行可视化展示


http://www.waitingfy.com/archives/4399

原文出处

0人推荐
随时随地看视频
慕课网APP