scores分析文文档格式.docx
《scores分析文文档格式.docx》由会员分享,可在线阅读,更多相关《scores分析文文档格式.docx(10页珍藏版)》请在冰点文库上搜索。
scores<-read.table("scores.txt",header=TRUE,row.names="num")
head(scores)
str(scores)
#显示对象的结构
names(scores)
#显示每一列的名称
attach(scores)
#给出数据的概略信息
summary(scores)
summary(scores$math)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
   3.00   84.00  100.00   93.98  111.00  120.00
#1stQu.第一个4分位数
#选择某行
child<-scores['239',]
sum(child)#求孩子的总分
[1]647.45
scores.class4<-scores[class==4,]
#挑出4班的
#求每个班的平均数学成绩
aver<-tapply(math,class,mean)
#画条曲线看看每个班的数学平均成绩
plot(aver,type='b',ylim=c(80,100),main="各班数学成绩平均分",xlab="班级",ylab="数学平均分")
#生成数据的二维列联表
table(math,class)
     class
math  1 2 3 4 5 6 7 8 9 10
  3   0 0 0 0 0 0 1 0 0  0
  9   1 0 0 0 0 0 0 0 0  0
  10  1 0 1 0 0 0 0 0 0  0
  18  0 0 0 1 0 1 0 0 1  0
……………
#求4班每一科的平均成绩
subjects<-c('chn','math','eng','phy','chem','politics','bio','history','geo','pe')
sapply(scores[class==4,subjects],mean)
     chn     math      eng      phy     chem politics      bio  history      geo       pe
83.10938 97.29688 85.60156 54.30469 34.67969 42.41406 41.79688 36.77344 44.24219 54.31250
#求各班各科的平均成绩
aggregate(scores[subjects],by=list(class),mean)
   Group.1      chn     math      eng      phy     chem politics      bio  history      geo       pe
1        1 82.98387 92.82258 92.45161 56.04516 34.95161 42.57258 42.29839 37.03226 43.44355 54.12903
2        2 81.57759 93.17241 85.01724 54.39483 34.60776 43.13793 42.05172 38.59483 43.60345 54.68966
3        3 82.62069 88.58621 82.46552 51.59483 32.33190 41.99138 41.59483 35.49138 42.97414 54.55172
4        4 83.10938 97.29688 85.60156 54.30469 34.67969 42.41406 41.79688 36.77344 44.24219 54.31250
5        5 84.74107 97.89286 83.66964 56.10000 33.91518 42.05357 42.57143 37.77679 43.96429 54.00000
6        6 83.14407 92.40678 78.57627 51.74068 33.36864 40.64407 41.55932 34.46610 43.37288 53.22034
7        7 83.01724 90.29310 87.00862 51.75172 33.98276 41.63793 42.51724 37.46552 44.22414 53.72414
8        8 83.65833 98.65000 86.91667 56.02333 36.07917 41.70000 42.40833 37.84167 44.81667 52.93333
9        9 83.20968 94.35484 86.48387 54.29516 36.11694 41.94355 42.72581 36.07258 44.30645 53.48387
10      10 84.33871 94.08065 86.66774 55.08548 36.01210 41.86290 42.22581 36.78226 44.14516 53.61290
#看看数学成绩的分布图
hist(math)
默认是按频数形成的直方图,设置freq参数可以画密度分布图。
hist(math,freq=FALSE)
lines(density(math),col='blue')
rug(jitter(math))
#轴须图,在轴旁边出现一些小线段,jitter是加噪函数
#核密度图
plot(density(chn),col='blue',lwd=2)
lines(density(math),col='red',lwd=2)
text(locator(2),c("语文","数学"))
#用鼠标拾取点,加上文本标注
#箱线图
boxplot(math)
boxplot.stats(math)#这个函数可以看到画出箱线图的具体的数据值
$stats
[1]  44  84 100 111 120
$n
[1] 599
#有效样本点个数
$conf
[1]  98.25696 101.74304
$out
#离群值
 [1] 38 42 35 40 43 36 41 40 36 18 26 36 42 32 41 29 18 24 10 20 34 19 10  3
[25] 35 20 35 18 22  9
#并列箱线图,看各班的数据分布情况
boxplot(math~class,data=scores)
lines(tapply(math,class,mean),col='red',type='b')#加上平均值
可以看出2班没有拖后腿的,4班有6个拖后腿的
#看看各科成绩的相关性
#可以看出:
数学和物理的相关性达88%,物理和化学成绩的相关性达86%。
cor(scores[,subjects])
               chn      math       eng       phy      chem  politics       bio   history       geo        pe
chn      1.0000000 0.6588126 0.7326778 0.6578172 0.6271155 0.7257003 0.6902282 0.6971145 0.6438662 0.2712453
math     0.6588126 1.0000000 0.8079255 0.8860467 0.8304643 0.7090681 0.7951987 0.7732791 0.7723853 0.3300249
eng      0.7326778 0.8079255 1.0000000 0.8170998 0.7868710 0.7498946 0.7731044 0.7948219 0.7265406 0.3159347
phy      0.6578172 0.8860467 0.8170998 1.0000000 0.8615512 0.7081717 0.8077105 0.8100599 0.7814152 0.3251233
chem     0.6271155 0.8304643 0.7868710 0.8615512 1.0000000 0.6441334 0.7578770 0.7993298 0.7264814 0.2769066
politics 0.7257003 0.7090681 0.7498946 0.7081717 0.6441334 1.0000000 0.7071181 0.7192860 0.6906930 0.3033607
bio      0.6902282 0.7951987 0.7731044 0.8077105 0.7578770 0.7071181 1.0000000 0.7771735 0.8382525 0.2428081
history  0.6971145 0.7732791 0.7948219 0.8100599 0.7993298 0.7192860 0.7771735 1.0000000 0.7731044 0.2708434
geo      0.6438662 0.7723853 0.7265406 0.7814152 0.7264814 0.6906930 0.8382525 0.7731044 1.0000000 0.2605251
pe       0.2712453 0.3300249 0.3159347 0.3251233 0.2769066 0.3033607 0.2428081 0.2708434 0.2605251 1.0000000
#画个图出来看看
pairs(scores[,subjects])
#详细看看数学和物理的线性相关性
cor_phy_math<-lm(phy~math,scores)
plot(math,phy)
abline(cor_phy_math)
cor_phy_math
#也就是说拟合公式为:
phy=0.5258*math+4.7374,为什么是0.52?
因为数学最高分为120,物理最高分为70
Call:
lm(formula=phy~math,data=scores)
Coefficients:
(Intercept)         math
     4.7374       0.5258