1.日志部分内容:
66.249.79.35 - - [14/Jun/2018:06:45:24 +0000] "GET /img/20180504/702434-20180302101540805-554506523.jpg HTTP/1.1" 200 10013 "-" "Googlebot-Image/1.0"
66.249.79.35 - - [14/Jun/2018:06:45:25 +0000] "GET /img/20180504/702434-20180302161346635-1714710787.jpg HTTP/1.1" 200 45157 "-" "Googlebot-Image/1.0"
66.249.79.35 - - [14/Jun/2018:06:45:32 +0000] "GET /img/2018/05/21/89993124.jpg HTTP/1.1" 200 42160 "-" "Googlebot-Image/1.0"
66.249.79.35 - - [14/Jun/2018:06:45:32 +0000] "GET /archives/148618 HTTP/1.1" 200 8932 "-" "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
54.36.148.126 - - [14/Jun/2018:06:45:33 +0000] "GET /archives/91429 HTTP/1.1" 200 8223 "-" "Mozilla/5.0 (compatible; AhrefsBot/5.2; +http://ahrefs.com/robot/)"
54.36.149.31 - - [14/Jun/2018:06:45:34 +0000] "GET /?s=Community HTTP/1.1" 200 6741 "-" "Mozilla/5.0 (compatible; AhrefsBot/5.2; +http://ahrefs.com/robot/)"
66.249.79.35 - - [14/Jun/2018:06:45:40 +0000] "GET /img/20180505/1018770-20180131142516171-907427428.jpg HTTP/1.1" 200 8652 "-" "Googlebot-Image/1.0"
5.255.250.200 - - [14/Jun/2018:06:45:46 +0000] "GET /robots.txt HTTP/1.1" 200 445 "-" "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"
66.249.79.35 - - [14/Jun/2018:06:45:46 +0000] "GET /archives/148211 HTTP/1.1" 200 8514 "-" "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
66.249.79.35 - - [14/Jun/2018:06:45:48 +0000] "GET /img/2018/05/20/1339446-20180517152850212-272519877.jpg HTTP/1.1" 200 124550 "-" "Googlebot-Image/1.0"
220.181.108.147 - - [14/Jun/2018:06:45:52 +0000] "GET /img/20180407/592104-20180302134147548-901544498.jpg HTTP/1.1" 404 22994 "-" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36"
66.249.79.35 - - [14/Jun/2018:06:45:56 +0000] "GET /img/2018/05/21/60662344.jpg HTTP/1.1" 200 14133 "-" "Googlebot-Image/1.0"
66.249.79.35 - - [14/Jun/2018:06:46:00 +0000] "GET /archives/119633 HTTP/1.1" 200 9306 "-" "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
54.36.148.129 - - [14/Jun/2018:06:46:01 +0000] "GET /archives/91007 HTTP/1.1" 200 8332 "-" "Mozilla/5.0 (compatible; AhrefsBot/5.2; +http://ahrefs.com/robot/)"
54.36.148.201 - - [14/Jun/2018:06:46:03 +0000] "GET /archives/88741/feed HTTP/1.1" 200 983 "-" "Mozilla/5.0 (compatible; AhrefsBot/5.2; +http://ahrefs.com/robot/)"
5.255.250.200 - - [14/Jun/2018:06:46:03 +0000] "GET /archives/87084 HTTP/1.1" 200 9951 "-" "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"
2. 使用 Spark 对日志进行清洗和统计,并把结果插入到 MySQL
package com.codeblogbt
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import scala.collection.mutable.ListBuffer
/**
 * Spark job that counts, per crawler, how many "archives" pages were requested
 * in an Apache access log, and hands the per-crawler counts to
 * `MysqlAction.insertTopRobot`.
 *
 * Pipeline: read the log as text, keep lines mentioning "archives", parse each
 * line into a `VisitInfo`, group by robot id, join with a small id -> name
 * lookup table, and write each partition of the result out.
 */
object SparkStatFormatJob {
  // Local import so the object stays self-contained (file-level imports are untouched).
  import scala.util.Try

  /** Log file read when no path is supplied as the first program argument. */
  private val DefaultLogPath = "file:///Users/walle/Documents/D2/log/apache2/access.log.3"

  /**
   * Substring markers used to recognise a crawler, paired with the robot id.
   * Order matters: the first marker contained in the line wins.
   */
  private val RobotMarkers: Seq[(String, Int)] = Seq(
    "Googlebot"   -> 3,
    "Baiduspider" -> 4,
    "Yandex"      -> 1,
    "ahrefs"      -> 2,
    "ia_archiver" -> 5
  )

  /**
   * Entry point.
   *
   * @param args optional; `args(0)` is the path/URI of the access log to read
   *             (e.g. an `hdfs://` or `file://` URI). Falls back to
   *             [[DefaultLogPath]] when absent.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SparkStatFormatJob")
      .master("local[2]").getOrCreate()
    // Always release the session, even when the job fails part-way through.
    try {
      import spark.implicits._

      val logPath = args.headOption.getOrElse(DefaultLogPath)
      val access = spark.sparkContext.textFile(logPath)

      // Cheap textual pre-filter first, then full parsing; unparseable lines are dropped.
      val visitDF = access
        .filter(line => line.contains("archives"))
        .flatMap(line => parseLine(line).toList)
        .toDF()

      val resultDF = visitDF.groupBy("robotId").agg(count("archivesId").as("id_count"))

      // Id 0 means "no crawler marker matched" and is labelled "people".
      val robotDF = Seq((0, "people"), (1, "Yandex"), (2, "Ahrefs"), (3, "Google"), (4, "Baidu"), (5, "ia_archiver"))
        .toDF("id", "robot_name")
      val joinDF = resultDF.join(robotDF, resultDF.col("robotId") === robotDF.col("id"))

      // Collect each partition into one buffer so it is handed over in a single call.
      joinDF.foreachPartition(partitionOfRecords => {
        val list = new ListBuffer[RobotVisitInfo]
        partitionOfRecords.foreach(info => {
          val id = info.getAs[Int]("id")
          val robotName = info.getAs[String]("robot_name")
          val visits = info.getAs[Long]("id_count")
          list.append(RobotVisitInfo(id, robotName, visits))
        })
        MysqlAction.insertTopRobot(list)
      })
    } finally {
      spark.stop()
    }
  }

  /**
   * Parses one space-separated access-log line.
   *
   * Fields used (0-based): 0 = client ip, 3 and 4 = timestamp and zone,
   * 6 = request URL, 8 = status code, 9 = response size.
   *
   * @return `None` when the line has fewer than 10 fields, the URL carries no
   *         usable archive id, or the status code is not an integer; such
   *         lines are skipped instead of failing the whole job.
   */
  private def parseLine(line: String): Option[VisitInfo] = {
    val splits = line.split(" ")
    if (splits.length < 10) None
    else
      for {
        archivesId <- extractArchivesId(splits(6))
        statusCode <- Try(splits(8).toInt).toOption
      } yield {
        // A non-numeric size is recorded as 0 rather than aborting the job.
        // NOTE(review): presumably this is Apache's "-" for an empty body — confirm.
        val traffic = Try(splits(9).toLong).getOrElse(0L)
        val time = splits(3) + " " + splits(4)
        VisitInfo(splits(0), DateUtils.parse(time), archivesId, statusCode, traffic, detectRobotId(line))
      }
  }

  /**
   * Reads the archive id from the last path segment of `url`.
   *
   * @return the id, or `None` when the URL does not contain "archives", the
   *         last segment is not an integer (e.g. ".../feed"), or the id is 0.
   */
  private def extractArchivesId(url: String): Option[Int] =
    if (!url.contains("archives")) None
    else Try(url.substring(url.lastIndexOf('/') + 1).toInt).toOption.filter(_ != 0)

  /** Returns the id of the first crawler marker found in `line`, or 0 when none matches. */
  private def detectRobotId(line: String): Int =
    RobotMarkers.collectFirst { case (marker, id) if line.contains(marker) => id }.getOrElse(0)
}
3. 用 ECharts 对 MySQL 中的统计结果进行可视化展示
http://www.waitingfy.com/archives/4399
原文出处
打开App,阅读手记