지식노동자의 노트

사례기반추론 SAS Coding 본문

데이타마이닝

사례기반추론 SAS Coding

생각하는너구리 2012. 7. 2. 23:13
728x90

/***************************************************************************************/

/*

Case based Reasoning

전형적인 K=10, K-NN CBR을 SAS로 코딩하였습니다.

1) Old Case read

2) New Case read

3) Compute Distance Between Old & New

4) Compute Similarity

5) Select Most Similar 10 Cases

6) Predict New Case's target variable with probability


*/

/***************************************************************************************/

/* Session options: SYMBOLGEN, MLOGIC and MPRINT write macro variable
   resolution, macro execution flow and macro-generated SAS code to the log
   (macro debugging aids); VALIDVARNAME=ANY relaxes the variable naming rules. */
options symbolgen mlogic mprint validvarname=any;

     

/*1) OLD Case Read*/

/* Stored (old) cases used as the reference set.
     id     - case identifier
     n1-n6  - numeric attributes
     c1, c2 - categorical attributes, read as character values
     target - numeric 0/1 outcome flag                              */
data _old_cases;
    input id n1-n6 c1 $ c2 $ target;
    datalines;
1       0.00    0.23    0.23    0.54    0.04    0.00    f       1       0
2       0.00    0.38    0.38    0.54    0.06    0.06    m       1       0
3       0.00    0.08    0.08    0.54    0.48    0.04    f       1       0
4       0.25    0.23    0.38    0.50    0.34    0.09    f       1       0
5       0.00    1.00    1.00    0.54    0.33    0.01    m       1       0
6       0.00    1.00    1.00    0.54    0.33    0.01    f       1       0
7       0.25    0.46    0.46    0.53    0.19    0.08    f       1       1
8       0.00    1.00    1.00    0.65    0.19    0.00    m       1       0
9       0.00    0.62    0.62    0.53    0.19    0.00    m       1       0
10      0.00    0.62    0.69    0.54    0.32    0.10    m       1       0
11      0.00    0.23    0.23    0.61    0.04    0.07    f       1       0
12      0.00    0.08    0.08    0.52    0.04    0.10    f       1       0
13      0.00    0.08    0.08    0.59    0.02    0.13    f       1       0
14      0.25    0.31    0.38    0.51    0.29    0.07    m       1       0
15      0.00    0.46    0.46    0.54    0.28    0.02    f       1       0
16      0.25    0.23    0.46    0.54    0.29    0.08    m       1       0
17      0.00    0.92    0.92    0.50    0.15    0.04    m       1       0
18      0.25    0.00    0.00    0.54    0.01    0.01    f       0       1
19      0.00    0.08    0.08    0.53    0.01    0.14    f       1       0
20      0.00    0.00    0.00    0.51    0.00    0.05    f       0       0
21      0.25    0.15    0.38    0.57    0.14    0.08    f       1       0
22      0.50    0.00    0.00    0.57    0.54    0.11    f       0       1
23      0.50    0.46    0.54    0.54    0.54    0.11    f       0       0
24      0.50    0.15    0.31    0.53    0.12    0.08    f       1       1
25      0.00    1.00    1.00    0.54    0.25    0.05    f       1       0
26      0.00    0.23    0.23    0.57    0.09    0.03    f       1       0
27      0.00    0.23    0.31    0.54    0.11    0.03    f       0       0
28      0.00    0.62    0.62    0.55    0.09    0.10    m       1       0
29      0.25    0.38    0.46    0.54    0.35    0.10    f       1       0
30      0.25    0.08    0.15    0.54    0.35    0.05    m       0       0
;
run;


/*2) New Case Read*/

/* New cases to be scored, read with the same column layout as _old_cases:
     id, numeric attributes n1-n6, character attributes c1 and c2,
     and the numeric 0/1 target.                                        */
data _New_cases;
    input id n1-n6 c1 $ c2 $ target;
    datalines;
1       0.00    0.23    0.23    0.54    0.04    0.00    f       1       0
2       0.00    0.38    0.38    0.54    0.06    0.06    m       1       0
3       0.00    0.08    0.08    0.54    0.48    0.04    f       1       0
4       0.25    0.23    0.38    0.50    0.34    0.09    f       1       0
5       0.00    1.00    1.00    0.54    0.33    0.01    m       1       0
6       0.00    1.00    1.00    0.54    0.33    0.01    f       1       0
7       0.25    0.46    0.46    0.53    0.19    0.08    f       1       1
8       0.00    1.00    1.00    0.65    0.19    0.00    m       1       0
9       0.00    0.62    0.62    0.53    0.19    0.00    m       1       0
10      0.00    0.62    0.69    0.54    0.32    0.10    m       1       0
;
run;


/* Prefix every column of the new cases with "new_" so that they do not
   collide with the identically named columns of _old_cases when the two
   tables are joined.                                                     */
data _New_cases(rename=(id     = new_id
                        n1     = new_n1
                        n2     = new_n2
                        n3     = new_n3
                        n4     = new_n4
                        n5     = new_n5
                        n6     = new_n6
                        c1     = new_c1
                        c2     = new_c2
                        target = new_target));
    set _New_cases;
run;




/*
 * computeSimilarity
 *
 * K-nearest-neighbour case-based reasoning. For every new case whose new_id
 * lies in [start, end], the macro
 *   1. pairs the new case with every row of _old_cases,
 *   2. computes a weighted distance and similarity = 1 / (1 + distance),
 *   3. keeps the k most similar old cases (ties broken by ascending id),
 *   4. estimates the probability of target = 1 as the mean target of those
 *      k neighbours.
 *
 * Parameters
 *   start= first new_id to score (integer)
 *   end=   last new_id to score (integer)
 *   k=     number of neighbours to keep; defaults to 10
 *
 * Inputs  : WORK._old_cases (id n1-n6 c1 c2 target)
 *           WORK._New_cases (new_id new_n1-new_n6 new_c1 new_c2 new_target)
 * Outputs : WORK._rslt_all  one row per scored new case with
 *                           new_id, new_target, cnt_y, cnt, probability
 *           WORK._rslt      result of the last scored case only
 */
%macro computeSimilarity(start=, end=, k=10);

    /* Keep the loop index private so a caller's own &i is not overwritten. */
    %local i;

    %do i = &start. %to &end.;

        /* Cross join: the single new case &i. against every old case. */
        proc sql;
            create table _tmpry00 as
            select *
            from _old_cases, _New_cases
            where new_id = &i.;
        quit;

        data _tmpry01;
            set _tmpry00;

            /* Attribute weights; they sum to 1. */
            wgt_n1 = 0.05;
            wgt_n2 = 0.05;
            wgt_n3 = 0.15;
            wgt_n4 = 0.15;
            wgt_n5 = 0.20;
            wgt_n6 = 0.15;
            wgt_c1 = 0.10;
            wgt_c2 = 0.15;

            /* Categorical distance is a 0/1 mismatch indicator:
               0 when the categories agree, 1 when they differ. */
            dst_c1 = (c1 ne new_c1);
            dst_c2 = (c2 ne new_c2);

            /* Weighted Euclidean distance over numeric and categorical parts. */
            distance = sqrt(
                         wgt_n1*(n1 - new_n1)**2
                      +  wgt_n2*(n2 - new_n2)**2
                      +  wgt_n3*(n3 - new_n3)**2
                      +  wgt_n4*(n4 - new_n4)**2
                      +  wgt_n5*(n5 - new_n5)**2
                      +  wgt_n6*(n6 - new_n6)**2
                      +  wgt_c1*dst_c1
                      +  wgt_c2*dst_c2
                       );

            /* Map distance in [0, inf) onto similarity in (0, 1]. */
            similarity = 1 / (1 + distance);
        run;

        /* Most similar first; id makes the order deterministic on ties. */
        proc sort data = _tmpry01;
            by descending similarity id;
        run;

        data _tmpry02;
            set _tmpry01;
            by descending similarity id;
            ranking + 1;
        run;

        /* Keep only the k nearest neighbours. */
        data _tmpry03;
            set _tmpry02;
            where ranking <= &k.;
        run;

        /* Share of neighbours with target = 1 is the predicted probability. */
        proc sql;
            create table _rslt as
            select
                  new_id
                , new_target
                , sum(target)            as cnt_y
                , count(*)               as cnt
                , sum(target) / count(*) as probability
            from _tmpry03
            group by new_id, new_target;
        quit;

        /* (Re)create the accumulator on the first iteration of this call,
           whatever value start= has; append on every later iteration. */
        %if &i. = &start. %then %do;
            data _rslt_all;
                set _rslt;
            run;
        %end;
        %else %do;
            data _rslt_all;
                set _rslt_all _rslt;
            run;
        %end;

        /* Drop the per-iteration work tables. */
        proc datasets lib=work nolist;
            delete _tmpry00 _tmpry01 _tmpry02 _tmpry03;
        quit;

    %end;

%mend computeSimilarity;


/* Score the new cases whose new_id runs from 1 through 10. */
%computeSimilarity(start=1, end=10)

/* Cross-tabulate the predicted probability against the actual target
   of the scored cases. */
proc freq data=_rslt_all;
    tables probability * new_target;
run;

반응형

'데이타마이닝' 카테고리의 다른 글

Weka 예제 파일(arff) 위치  (0) 2015.08.06
R package malformed error  (0) 2015.06.11
R 관련 서적 모음  (0) 2015.01.08
Weka실습 Decision Tree  (6) 2012.06.05
무료 데이타마이닝툴 WEKA(웨카)  (0) 2012.05.22
Comments