大数据测试之日志分析1

Result 文件数据说明:

Ip:106.39.41.166,(城市)

Date:10/Nov/2016:00:01:02 +0800,(日期)

Day:10,(天数)

Traffic: 54 ,(流量)

Type: video,(类型:视频 video 或文章 article)

Id: 8701(视频或者文章的 id)

测试要求:

1、 数据清洗:按照要求进行数据清洗,并将清洗后的数据导入 MongoDB 数据库中。

两阶段数据清洗:

(1)第一阶段:把需要的信息从原始日志中提取出来

ip:    199.30.25.88

time:  10/Nov/2016:00:01:03 +0800

traffic:  62

文章: article/11325

视频: video/3235

(2)第二阶段:根据提取出来的信息做精细化操作

ip---> 城市 city(IP)

date--> time:2016-11-10 00:01:03

day: 10

traffic:62

type:article/video

id:11325

(3)MongoDB 数据库表结构:

create table data( ip string,  time string , day string, traffic bigint,type string, id   string) 

清洗数据代码:

package mongotest3;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;

/**
 * Second-stage cleaner for the extracted access-log records.
 *
 * <p>Each input line is expected to hold six comma-separated fields:
 * {@code ip,date,day,traffic,type,id}, for example
 * {@code 106.39.41.166,10/Nov/2016:00:01:02 +0800,10,54 ,video,8701}.
 * For every line the date is rewritten as {@code yyyy-MM-dd HH:mm:ss}
 * (the zone offset is dropped) and surrounding whitespace is removed from the
 * traffic field. The cleaned record is echoed to standard output
 * (tab-separated) and appended to the output file (comma-separated, CRLF
 * line endings, UTF-8).
 */
public class CleanData {

    /** Input file used when no path is supplied on the command line. */
    private static final String DEFAULT_INPUT =
            "D:\\java\\eclipse-workplace\\mongotest3\\src\\mongotest3\\result.txt";

    /** Output file used when no path is supplied on the command line. */
    private static final String DEFAULT_OUTPUT = "D:\\result2.txt";

    /** Number of comma-separated fields a well-formed record has. */
    private static final int FIELD_COUNT = 6;

    /** English month abbreviations in calendar order; index + 1 is the month number. */
    private static final String[] MONTHS = {
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
    };

    /**
     * Cleans the input file line by line and writes the result file.
     *
     * @param args optional; {@code args[0]} overrides the input path and
     *             {@code args[1]} overrides the output path. With no arguments
     *             the original hard-coded paths are used.
     * @throws FileNotFoundException if the input cannot be opened for reading
     *                               or the output cannot be opened for writing
     * @throws UnsupportedEncodingException if the "utf-8" charset is unavailable
     */
    public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        // try-with-resources guarantees the writer is flushed and both files are
        // closed; without that the buffered tail of the output is silently lost.
        // NOTE: FileReader decodes with the platform default charset, as before.
        try (BufferedReader br = new BufferedReader(new FileReader(inputPath));
             BufferedWriter bw = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(outputPath), "utf-8"))) {
            String row;
            int lineNumber = 0;
            while ((row = br.readLine()) != null) {
                lineNumber++;
                if (row.trim().isEmpty()) {
                    continue; // blank separator lines carry no record
                }
                String[] data = parseRow(row);
                if (data == null) {
                    // One bad record should not abort the whole cleaning run.
                    System.err.println("Skipping malformed line " + lineNumber + ": " + row);
                    continue;
                }

                StringBuilder echo = new StringBuilder();
                StringBuilder cleaned = new StringBuilder();
                for (int i = 0; i < data.length; i++) {
                    echo.append(data[i]).append("\t");
                    if (i > 0) {
                        cleaned.append(",");
                    }
                    cleaned.append(data[i]);
                }
                System.out.println(echo);
                bw.write(cleaned + "\r\n");
            }
        } catch (FileNotFoundException | UnsupportedEncodingException e) {
            // Keep the declared contract: failures to open the files propagate.
            throw e;
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Splits one raw line into its six fields and normalises the date and
     * traffic fields.
     *
     * @param row a non-blank input line
     * @return the six cleaned fields in input order, or {@code null} when the
     *         line does not have exactly six fields or its date has fewer than
     *         six components
     */
    private static String[] parseRow(String row) {
        // Limit -1 keeps trailing empty fields so the count check is exact.
        String[] data = row.split(",", -1);
        if (data.length != FIELD_COUNT) {
            return null;
        }

        // "10/Nov/2016:00:01:02 +0800" -> [10, Nov, 2016, 00, 01, 02, +0800]
        String[] time = data[1].split("[/: ]", -1);
        if (time.length < 6) {
            return null;
        }
        data[1] = time[2] + "-" + monthNumber(time[1]) + "-" + time[0]
                + " " + time[3] + ":" + time[4] + ":" + time[5];

        // Trim instead of chopping the last character, so a value without a
        // trailing space (e.g. "62") is not truncated to "6".
        data[3] = data[3].trim();
        return data;
    }

    /**
     * Converts an English month abbreviation to its two-digit number.
     *
     * @param name month abbreviation such as {@code "Nov"}
     * @return {@code "01"}..{@code "12"}, or {@code name} unchanged when it is
     *         not a recognised abbreviation
     */
    private static String monthNumber(String name) {
        for (int i = 0; i < MONTHS.length; i++) {
            if (MONTHS[i].equals(name)) {
                return (i < 9 ? "0" : "") + (i + 1);
            }
        }
        return name;
    }

}

运行结果: