일 | 월 | 화 | 수 | 목 | 금 | 토 |
---|---|---|---|---|---|---|
 |  |  |  |  |  | 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 17 | 18 | 19 | 20 | 21 | 22 |
23 | 24 | 25 | 26 | 27 | 28 |
- subset
- SAS 연산오류
- 리더쉽
- SAS 날짜
- 데이타마이닝 툴
- 얼굴합성앱추천
- Deepfake
- 기계학습
- 책모음
- 뉴스로 이해하는 베트남
- 딥페이크
- 토드부크홀츠
- R 프로그래밍
- 베트남삼겹살
- 사례기반추론
- FPT #베트남기업
- 베트남 #자동차 #베트남자동차보급율
- 오블완
- hdmi젠더
- 스틱pc
- 지방자치단체조합
- 티스토리챌린지
- r
- 일본 뜀틀 응원 기적 감동
- 웨카
- 강의자료
- SAS에서 한글사용
- SAS
- SAS 계산오류
- HDMI-DVI
- Today
- Total
지식노동자의 노트
사례기반추론 SAS Coding 본문
/***************************************************************************************/
/*
Case-Based Reasoning (CBR)
전형적인 K-NN(K=10) 방식의 CBR을 SAS로 코딩하였습니다.
1) Old Case read
2) New Case read
3) Compute Distance Between Old & New
4) Compute Similarity
5) Select Most Similar 10 Cases
6) Predict New Case's target variable with probability
*/
/***************************************************************************************/
/* Session options: echo macro-variable resolution (SYMBOLGEN), macro logic
   tracing (MLOGIC) and generated code (MPRINT) to the log, and set
   VALIDVARNAME=ANY. */
options
    symbolgen
    mlogic
    mprint
    validvarname=any
;
/* 1) Case base: the 30 stored (old) cases used as neighbours.
      id      - case identifier
      n1-n6   - numeric attributes (all listed values lie in [0, 1])
      c1, c2  - attributes read as character values
      target  - 0/1 outcome of the stored case */
data _old_cases;
    input id n1-n6 c1 $ c2 $ target;
    datalines;
1 0.00 0.23 0.23 0.54 0.04 0.00 f 1 0
2 0.00 0.38 0.38 0.54 0.06 0.06 m 1 0
3 0.00 0.08 0.08 0.54 0.48 0.04 f 1 0
4 0.25 0.23 0.38 0.50 0.34 0.09 f 1 0
5 0.00 1.00 1.00 0.54 0.33 0.01 m 1 0
6 0.00 1.00 1.00 0.54 0.33 0.01 f 1 0
7 0.25 0.46 0.46 0.53 0.19 0.08 f 1 1
8 0.00 1.00 1.00 0.65 0.19 0.00 m 1 0
9 0.00 0.62 0.62 0.53 0.19 0.00 m 1 0
10 0.00 0.62 0.69 0.54 0.32 0.10 m 1 0
11 0.00 0.23 0.23 0.61 0.04 0.07 f 1 0
12 0.00 0.08 0.08 0.52 0.04 0.10 f 1 0
13 0.00 0.08 0.08 0.59 0.02 0.13 f 1 0
14 0.25 0.31 0.38 0.51 0.29 0.07 m 1 0
15 0.00 0.46 0.46 0.54 0.28 0.02 f 1 0
16 0.25 0.23 0.46 0.54 0.29 0.08 m 1 0
17 0.00 0.92 0.92 0.50 0.15 0.04 m 1 0
18 0.25 0.00 0.00 0.54 0.01 0.01 f 0 1
19 0.00 0.08 0.08 0.53 0.01 0.14 f 1 0
20 0.00 0.00 0.00 0.51 0.00 0.05 f 0 0
21 0.25 0.15 0.38 0.57 0.14 0.08 f 1 0
22 0.50 0.00 0.00 0.57 0.54 0.11 f 0 1
23 0.50 0.46 0.54 0.54 0.54 0.11 f 0 0
24 0.50 0.15 0.31 0.53 0.12 0.08 f 1 1
25 0.00 1.00 1.00 0.54 0.25 0.05 f 1 0
26 0.00 0.23 0.23 0.57 0.09 0.03 f 1 0
27 0.00 0.23 0.31 0.54 0.11 0.03 f 0 0
28 0.00 0.62 0.62 0.55 0.09 0.10 m 1 0
29 0.25 0.38 0.46 0.54 0.35 0.10 f 1 0
30 0.25 0.08 0.15 0.54 0.35 0.05 m 0 0
;
run;
/* 2) Query set: the 10 new cases to be scored.
      The columns mirror _old_cases but carry a new_ prefix, so the old and
      new attributes can sit side by side in one row after the two tables are
      joined. The values are read straight into the prefixed names. */
data _New_cases;
    input new_id new_n1-new_n6 new_c1 $ new_c2 $ new_target;
    datalines;
1 0.00 0.23 0.23 0.54 0.04 0.00 f 1 0
2 0.00 0.38 0.38 0.54 0.06 0.06 m 1 0
3 0.00 0.08 0.08 0.54 0.48 0.04 f 1 0
4 0.25 0.23 0.38 0.50 0.34 0.09 f 1 0
5 0.00 1.00 1.00 0.54 0.33 0.01 m 1 0
6 0.00 1.00 1.00 0.54 0.33 0.01 f 1 0
7 0.25 0.46 0.46 0.53 0.19 0.08 f 1 1
8 0.00 1.00 1.00 0.65 0.19 0.00 m 1 0
9 0.00 0.62 0.62 0.53 0.19 0.00 m 1 0
10 0.00 0.62 0.69 0.54 0.32 0.10 m 1 0
;
run;
/*
   computeSimilarity - k-nearest-neighbour scoring of new cases against the
   stored case base.

   Parameters
     start=, end=  Inclusive range of new_id values to score (integers).
     k=            Number of most similar old cases used per new case
                   (default 10).

   Inputs (WORK library)
     _old_cases    Columns id, n1-n6, c1, c2, target.
     _New_cases    Columns new_id, new_n1-new_n6, new_c1, new_c2, new_target.

   Output (WORK library)
     _rslt_all     One row per (new_id, new_target) with
                     cnt_y       - sum of target over the k neighbours
                     cnt         - number of neighbours used
                     probability - cnt_y / cnt
                   The table is rebuilt when new_id 1 is processed and is
                   appended to (or created if absent) for any other new_id,
                   so the range may be scored in several calls.
     _rslt         Result rows of the last new_id processed.
*/
%macro computeSimilarity(start=, end=, k=10);
    /* Keep the loop counter private so a caller's &i is not overwritten. */
    %local i;

    %do i = &start. %to &end.;

        /* Pair the current new case with every old case. */
        proc sql;
            create table _tmpry00 as
            select *
            from _old_cases, _New_cases
            where new_id = &i.;
        quit;

        /* Weighted distance between the old and the new case. */
        data _tmpry01;
            set _tmpry00;

            /* Attribute weights (they sum to 1). */
            wgt_n1 = 0.05;
            wgt_n2 = 0.05;
            wgt_n3 = 0.15;
            wgt_n4 = 0.15;
            wgt_n5 = 0.20;
            wgt_n6 = 0.15;
            wgt_c1 = 0.10;
            wgt_c2 = 0.15;

            /* Categorical mismatch distance: 0 when the values agree,
               1 when they differ, so matching cases end up closer. */
            dst_c1 = (c1 ne new_c1);
            dst_c2 = (c2 ne new_c2);

            distance = sqrt(
                  wgt_n1*(n1 - new_n1)**2
                + wgt_n2*(n2 - new_n2)**2
                + wgt_n3*(n3 - new_n3)**2
                + wgt_n4*(n4 - new_n4)**2
                + wgt_n5*(n5 - new_n5)**2
                + wgt_n6*(n6 - new_n6)**2
                + wgt_c1*dst_c1
                + wgt_c2*dst_c2
            );

            /* Map distance in [0, inf) to similarity in (0, 1]. */
            similarity = 1 / (1 + distance);
        run;

        /* Most similar first; id breaks ties deterministically. */
        proc sort data = _tmpry01;
            by descending similarity id;
        run;

        /* Keep the k most similar old cases. */
        data _tmpry02;
            set _tmpry01;
            by descending similarity id;
            ranking + 1;
            if ranking <= &k.;
        run;

        /* Share of positive targets among the neighbours. */
        proc sql;
            create table _rslt as
            select
                  new_id
                , new_target
                , sum(target) as cnt_y
                , count(*) as cnt
                , sum(target) / count(*) as probability
            from _tmpry02
            group by new_id , new_target;
        quit;

        /* new_id 1 starts a fresh result table; NOWARN keeps the log clean
           when there is nothing to delete. */
        %if &i. = 1 %then %do;
            proc datasets lib=work nolist nowarn;
                delete _rslt_all;
            quit;
        %end;

        /* PROC APPEND creates the base table when it does not exist yet, so
           a run that does not begin at new_id 1 also works. */
        proc append base=_rslt_all data=_rslt;
        run;

        /* Remove the per-iteration work tables. */
        proc datasets lib=work nolist nowarn;
            delete _tmpry00 _tmpry01 _tmpry02;
        quit;

    %end;
%mend computeSimilarity;
/* Score new cases 1 through 10; the macro collects the results in _rslt_all. */
%computeSimilarity(end=10, start=1)

/* Cross-tabulate the predicted probability against the new cases' target. */
proc freq data=_rslt_all;
    tables probability * new_target;
run;
'데이타마이닝' 카테고리의 다른 글
Weka 예제 파일(arff) 위치 (0) | 2015.08.06 |
---|---|
R package malformed error (0) | 2015.06.11 |
R 관련 서적 모음 (0) | 2015.01.08 |
Weka실습 Decision Tree (6) | 2012.06.05 |
무료 데이타마이닝툴 WEKA(웨카) (0) | 2012.05.22 |