大数据测试之日志分析1
Result 文件数据说明:
Ip:106.39.41.166,(城市)
Date:10/Nov/2016:00:01:02 +0800,(日期)
Day:10,(天数)
Traffic: 54 ,(流量)
Type: video,(类型:视频 video 或文章 article)
Id: 8701(视频或者文章的 id)
测试要求:
1、 数据清洗:按照以下两个阶段进行数据清洗,并将清洗后的数据导入 MongoDB 数据库中。
两阶段数据清洗:
(1)第一阶段:把需要的信息从原始日志中提取出来
ip: 199.30.25.88
time: 10/Nov/2016:00:01:03 +0800
traffic: 62
文章: article/11325
视频: video/3235
(2)第二阶段:根据提取出来的信息做精细化操作
ip---> 城市 city(IP)
date--> time:2016-11-10 00:01:03
day: 10
traffic:62
type:article/video
id:11325
(3)MongoDB 数据库表结构:
create table data( ip string, time string , day string, traffic bigint,type string, id string)
清洗数据代码:
package mongotest3;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
/**
 * Second-stage cleaner for the comma-separated access-log extract.
 *
 * <p>Each input line is expected to hold six comma-separated fields:
 * {@code ip,time,day,traffic,type,id}, for example
 * {@code 106.39.41.166,10/Nov/2016:00:01:02 +0800,10,54 ,video,8701}.
 * For every well-formed line the cleaner
 * <ul>
 *   <li>rewrites the time from {@code dd/MMM/yyyy:HH:mm:ss zone} to
 *       {@code yyyy-MM-dd HH:mm:ss} (the zone offset is dropped),</li>
 *   <li>strips surrounding whitespace from the traffic field,</li>
 *   <li>leaves ip, day, type and id untouched,</li>
 * </ul>
 * echoes the cleaned fields to standard output (tab-separated) and appends them
 * as a comma-separated, CRLF-terminated line to the output file.
 */
public class CleanData {

    /** Input file used when no path is supplied on the command line. */
    private static final String DEFAULT_INPUT =
            "D:\\java\\eclipse-workplace\\mongotest3\\src\\mongotest3\\result.txt";

    /** Output file used when no path is supplied on the command line. */
    private static final String DEFAULT_OUTPUT = "D:\\result2.txt";

    /** Number of comma-separated fields a valid record must contain. */
    private static final int FIELD_COUNT = 6;

    /** Minimum number of pieces in a valid timestamp: day, month, year, hour, minute, second. */
    private static final int MIN_TIME_PARTS = 6;

    /**
     * Cleans the input file line by line and writes the result to the output file.
     *
     * @param args optional; {@code args[0]} overrides the input path and
     *             {@code args[1]} overrides the output path. With no arguments the
     *             original hard-coded paths are used.
     * @throws FileNotFoundException        if the input cannot be opened or the output
     *                                      cannot be created
     * @throws UnsupportedEncodingException if the {@code utf-8} encoding is unavailable
     */
    public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;
        File outFile = new File(outputPath);

        // try-with-resources guarantees the writer is flushed and both files are closed;
        // without it the buffered output never reaches the disk.
        try (BufferedReader br = new BufferedReader(new FileReader(inputPath));
                BufferedWriter bw = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream(outFile), "utf-8"))) {
            String row;
            int lineNo = 0;
            while ((row = br.readLine()) != null) {
                lineNo++;
                if (row.trim().isEmpty()) {
                    continue; // blank lines carry no record
                }
                String[] data = cleanRow(row);
                if (data == null) {
                    System.err.println("Skipping malformed line " + lineNo + ": " + row);
                    continue;
                }

                StringBuilder echo = new StringBuilder();
                StringBuilder csv = new StringBuilder();
                for (int i = 0; i < data.length; i++) {
                    echo.append(data[i]).append("\t");
                    if (i > 0) {
                        csv.append(",");
                    }
                    csv.append(data[i]);
                }
                System.out.println(echo);
                bw.write(csv + "\r\n");
            }
        } catch (FileNotFoundException | UnsupportedEncodingException e) {
            // Failing to open either file is reported to the caller, as declared.
            throw e;
        } catch (IOException e) {
            // Read/write failures are reported but do not abort with a checked exception,
            // because the method signature does not declare IOException.
            e.printStackTrace();
        }
    }

    /**
     * Splits one raw line into its six fields and normalizes the time and traffic fields.
     *
     * @param row one line of the input file
     * @return the six cleaned fields, or {@code null} if the line does not have exactly
     *         six fields or its timestamp has fewer than six parts
     */
    private static String[] cleanRow(String row) {
        // limit -1 keeps trailing empty fields so the count check is accurate
        String[] data = row.split(",", -1);
        if (data.length != FIELD_COUNT) {
            return null;
        }

        // "10/Nov/2016:00:01:02 +0800" -> [10, Nov, 2016, 00, 01, 02, +0800]
        String[] time = data[1].split("[/: ]", -1);
        if (time.length < MIN_TIME_PARTS) {
            return null;
        }
        data[1] = time[2] + "-" + monthNumber(time[1]) + "-" + time[0]
                + " " + time[3] + ":" + time[4] + ":" + time[5];

        // The traffic column may carry padding spaces (e.g. "54 "); trim instead of
        // chopping the last character so an unpadded value is not truncated.
        data[3] = data[3].trim();
        return data;
    }

    /**
     * Converts an English three-letter month abbreviation to its two-digit number.
     *
     * @param abbreviation month abbreviation such as {@code "Nov"}
     * @return the two-digit month (e.g. {@code "11"}), or the argument unchanged if it
     *         is not a recognized abbreviation
     */
    private static String monthNumber(String abbreviation) {
        switch (abbreviation) {
            case "Jan": return "01";
            case "Feb": return "02";
            case "Mar": return "03";
            case "Apr": return "04";
            case "May": return "05";
            case "Jun": return "06";
            case "Jul": return "07";
            case "Aug": return "08";
            case "Sep": return "09";
            case "Oct": return "10";
            case "Nov": return "11";
            case "Dec": return "12";
            default: return abbreviation;
        }
    }
}
运行结果: