빅데이터 분석기사 실기 4회

2024-11-28 1 분 소요

1유형

1번

import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e4_p1_1.csv')
df.head()

# Temperature컬럼에서 숫자가 아닌 문자들을 제거후 
# 숫자 타입으로 바꾸고 
# 3분위수에서 1분위수의 차이를 
# 소숫점 이하 2자리까지 구하여라
df["Temperature_f"] = df["Temperature"].str.replace('*', '', regex = True).astype("float")
result = round(df["Temperature_f"].quantile(0.75) - df["Temperature_f"].quantile(0.25), 2)
result

2번

import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e4_p1_2.csv')
df.head(5)

# Likes를 Comments로 나눈 비율이 20이상이면서 Keyword값이 minecraft인 영상들의 Views값의 평균을 정수로 구하여라
condition = (df["Likes"] / df["Comments"] >= 20) & (df["Keyword"] == "minecraft")
result = df[condition]["Views"].mean().astype(int)
result

3번

import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e4_p1_3.csv')
df.head(5)

df[(df["date_added"].dt.strftime("%Y-%m") == "2018-01") & (df.country == "United Kingdom")].shape[0]

2유형

1번

##### 2유형 #####
import pandas as pd
train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e4_p2_train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e4_p2_test.csv')

display(train.head(2))
test.head(2)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


x = train.drop(columns = ["Segmentation"])
y = train["Segmentation"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)
test = pd.get_dummies(test)

# x_train, x_test = x_train.align(x_test, join = "left", axis = 1, fill_value = 0)
# x_train, test = x_train.align(test, join = "left", axis = 1, fill_value = 0)

rf = RandomForestClassifier()
rf.fit(x_train, y_train)

x_test = pd.get_dummies(x_test)

# 검증 데이터 평가
pred = rf.predict(x_test)
result = f1_score(y_test, pred, average = "macro")
print(result)

test_pred = rf.predict(test)
test_pred

3유형

1번

# 어느 대학교의 신입생의 학과별 성별에 대한 데이터이다.
# 데이터를 바탕으로, 학생들의 학과와 성별이 서로 독립적인지 여부를 확인하기 위해 
# 카이제곱 독립성 검정을 실시 하려한다.

import pandas as pd 
df= pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e4_p3_1_.csv')
df.head()

# 학과 평균 인원에 대한 값을 소숫점 이하 3자리까지 구하여라
round(df.groupby(["학과"]).size().mean(), 3)

2번

# 카이제곱검정 독립성 검정 통계량을 소숫점 이하 3자리까지 구하여라
from scipy.stats import chi2_contingency
crosstab = pd.crosstab(df["학과"], df["성별"])
s, p, _, _ = chi2_contingency(crosstab)

round_s = round(s, 3)
round_s

3번

# 카이제곱검정 독립성 검정의 pvalue를 소숫점 이하 3자리까지 구하여라. 유의수준 0.05하에서 귀무가설과 대립가설중 유의한 것을 출력하라
print(round(p, 3)) # 귀무가설 채택
print("귀무가설")

4번

# 어느 학교에서 수학 시험을 본 학생 100명 중 60명이 60점 이상을 받았다. 
# 이 학교의 수학 시험의 평균 점수가 50점 이상인지 
# 95%의 신뢰 수준에서 검정하려한다
import numpy as np
n = 100
p = 0.6
p_hat = 0.5
alpha = 0.05

z = round((p - p_hat) / np.sqrt(p * (1 - p) / n), 5)
z

5번

# pvalue를 소숫점 이하 3자리까지 구하고 귀무가설과 대립가설중 유의한 것을 출력하라

from scipy.stats import norm
p_value = round(1 - norm.cdf(z),3)

print(p_value)
print('대립')

Twitter Facebook LinkedIn

빅데이터 분석기사 실기 4회

1유형

1번

2번

3번

2유형

1번

3유형

1번

2번

3번

4번

5번

공유하기

댓글남기기

참고

1/20

1/16 cv2라이브러리 설치

12/20 자바 코테

12/20 SQL 문제