-
Notifications
You must be signed in to change notification settings - Fork 0
/
syscallAnalysis.R
167 lines (151 loc) · 5.57 KB
/
syscallAnalysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# Session setup.
# The workspace is deliberately NOT wiped here: `rm(list = ls())` destroys
# whatever unrelated objects the caller has in the session, and the objects
# this script works with are loaded or assigned further down anyway.
#lab
# Switch to the project checkout only when it exists on this machine, so the
# script does not abort with "cannot change working directory" elsewhere.
# When the directory is absent, the relative paths used below are resolved
# against the current working directory (start R from the project root).
project_dir <- "/Users/jing/Desktop/class/ECS251/Android-syscall-analysis"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
library(rjson)
library(tidytext)
library(dplyr)
library(tidyr)
library(stringr)
library(igraph)
library(ggplot2)
library(ggraph)
## Restore the cached data frames from their .Rda files.
## load() recreates objects under the names they were saved with; the code
## further down reads `df_benign` and `df_malicious`.
for (rda_file in c("benign_sysc.Rda", "malicious_sysc.Rda")) {
  load(rda_file)
}
## load benign data
# filePath_b <- "../data/benign"
# temp_b <- list.files(filePath_b, pattern="*.json", full.names=TRUE)
# names_b <- str_extract(temp_b, "benign_...")
# data_temp <- fromJSON(paste(readLines(temp_b[1]), collapse = ""))
# syscalls <- unlist(data_temp, use.names = FALSE)
# df_benign <- data.frame(text = c(syscalls), book = names_b[1])
# for(i in 2:length(temp_b)) {
# data_temp <- fromJSON(paste(readLines(temp_b[i]), collapse = ""))
# syscalls <- unlist(data_temp, use.names = FALSE)
# temp_df <- data.frame(text = syscalls, book = names_b[i])
# df_benign <- rbind(df_benign, temp_df)
# }
##save data.frame
#save(df_benign,file="benign_sysc.Rda")
## load malicious data
# filePath_m <- "../data/malicious"
# temp_m <- list.files(filePath_m, pattern="*.json", full.names=TRUE)
# names_m <- str_extract(temp_m, "malicious_...")
# data_temp <- fromJSON(paste(readLines(temp_m[1]), collapse = ""))
# syscalls <- unlist(data_temp, use.names = FALSE)
# df_malicious <- data.frame(text = c(syscalls), book = names_m[1])
# for(i in 2:length(temp_m)) {
# data_temp <- fromJSON(paste(readLines(temp_m[i]), collapse = ""))
# syscalls <- unlist(data_temp, use.names = FALSE)
# temp_df <- data.frame(text = syscalls, book = names_m[i])
# df_malicious <- rbind(df_malicious, temp_df)
# }
# save(df_malicious,file="malicious_sysc.Rda")
# Read the per-app syscall traces (JSON files under `data/`) into one long
# data frame `df_apps` with two columns:
#   text - the values obtained by flattening the parsed JSON
#   book - the app label the values came from
filePath <- "data"
# list.files() expects a regular expression, not a shell glob: "*.json" would
# also match names that merely contain "json" somewhere.
temp <- list.files(filePath, pattern = "\\.json$", full.names = TRUE)
names <- c("messenger", "momentWeChat", "MsgWeChat", "whatsapp")
# NOTE(review): labels are paired with files purely by position in the sorted
# list.files() result - confirm that this order matches `names`.
if (length(temp) < length(names)) {
  stop(
    "Expected at least ", length(names), " JSON files in '", filePath,
    "' but found ", length(temp), ".",
    call. = FALSE
  )
}
# Build one data frame per app, then bind them in a single call instead of
# growing `df_apps` with rbind() inside a loop.
app_frames <- lapply(seq_along(names), function(i) {
  parsed <- fromJSON(paste(readLines(temp[i]), collapse = ""))
  syscalls <- unlist(parsed, use.names = FALSE)
  # rep() keeps the frame well-formed (zero rows) when a trace is empty.
  data.frame(text = syscalls, book = rep(names[i], length(syscalls)))
})
df_apps <- do.call(rbind, app_frames)
# bigram_benign <- df_benign[df_benign$book == names_b[10],] %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
# Tokenise both corpora into bigrams (n = 2) and split every bigram into its
# two component tokens, `word1` and `word2`.
# NOTE(review): whether unnest_tokens() joins text across rows before forming
# n-grams depends on the tidytext version (`collapse` argument) - confirm the
# bigrams span consecutive rows as intended.
bigram_benign <- df_benign %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
# A bigram has exactly two tokens; asking separate() for a third column only
# produced a warning and an all-NA `word3`. The benign split also gets its own
# name so it is no longer overwritten by the malicious split below.
bigrams_sep_benign <- bigram_benign %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigram_malicious <- df_malicious %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
# `bigrams_sep` (malicious corpus) is what the rest of the script consumes.
bigrams_sep <- bigram_malicious %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Discard every bigram in which either token is listed in `stop_words`.
# NOTE(review): `stop_words` is presumably tidytext's English prose lexicon -
# verify that it does not drop genuine syscall names from the analysis.
bigram_filter <- filter(
  bigrams_sep,
  !(word1 %in% stop_words$word | word2 %in% stop_words$word)
)

# Frequency of each (word1, word2) pair, most frequent first.
bigram_count <- count(bigram_filter, word1, word2, sort = TRUE)

# Glue the two tokens back into a single space-separated `bigram` column.
bigrams_united <- unite(bigram_filter, bigram, word1, word2, sep = " ")

# Per-book bigram counts, scored with tf-idf and ordered by descending score.
bigram_book_counts <- count(bigrams_united, book, bigram)
bigram_tf_idf <- arrange(
  bind_tf_idf(bigram_book_counts, bigram, book, n),
  desc(tf_idf)
)
bigram_tf_idf
# Build a graph from the bigrams counted more than 500 times: each remaining
# row of the count table (word1, word2, n) is handed to igraph as one edge.
frequent_pairs <- filter(bigram_count, n > 500)
bigram_graph <- graph_from_data_frame(frequent_pairs)

# Plot 1: "fr" layout with arrows on the edges, edge transparency mapped to
# the count `n`, and light-blue nodes labelled by name. The seed is set
# immediately before ggraph() so the layout is reproducible.
set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
weighted_plot <- ggraph(bigram_graph, layout = "fr")
weighted_plot <- weighted_plot +
  geom_edge_link(
    aes(edge_alpha = n),
    show.legend = FALSE,
    arrow = a,
    end_cap = circle(.07, 'inches')
  )
weighted_plot <- weighted_plot +
  geom_node_point(color = "lightblue", size = 5)
weighted_plot <- weighted_plot +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
weighted_plot <- weighted_plot + theme_void()
weighted_plot

# Plot 2: the same graph with default edge/node styling and a different seed.
set.seed(2017)
plain_plot <- ggraph(bigram_graph, layout = "fr")
plain_plot <- plain_plot + geom_edge_link()
plain_plot <- plain_plot + geom_node_point()
plain_plot <- plain_plot +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
plain_plot
# Keep only the bigram text and its tf-idf score.
x <- bigram_tf_idf[, c("bigram", "tf_idf"), drop = FALSE]
# for(name in names_b) {
# bigram_temp <- df_benign[df_benign$book == name,] %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
# bigram_benign <- unite()
# }
#
#
# json_file <- "/Users/jingli/Desktop/classes/ECS251/project/benign/benign_11.json"
# json_file_2 <- "/Users/jingli/Desktop/classes/ECS251/project/benign/benign_15.json"
# json_data_1 <- fromJSON(paste(readLines(json_file), collapse = ""))
# json_data_2 <- fromJSON(paste(readLines(json_file_2), collapse = ""))
#
# data1 <- unlist(json_data_1, use.names = FALSE)
# data2 <- unlist(json_data_2, use.names = FALSE)
# df <- data_frame(text = data, book = "benign_11")
# df_book <- data_frame(book = data1)
#
# mySyscalls <- df %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
#
# mySyscalls_f <- mySyscalls %>% count(bigram, sort = TRUE)
# mySyscalls_united <- data.frame(book = "a", bigram = mySyscalls_f$bigram)
# # mySyscalls_sep <- mySyscalls_f %>% separate(bigram, c("word1", "word2"), sep = " ")
# # mySyscalls_united <- mySyscalls_sep %>% unite(bigram, word1, word2, sep = " ")
#
# mySyscalls_tf_idf <- mySyscalls_united %>%
# count(book, bigram) %>%
# bind_tf_idf(bigram, book, n) %>%
# arrange(desc(tf_idf))
#
#
#
# austen_bigrams <- austen_books() %>%
# unnest_tokens(bigram, text, token = "ngrams", n = 2)
#
# austen_bigrams
#
# bigrams_separated <- austen_bigrams %>%
# separate(bigram, c("word1", "word2"), sep = " ")
#
# bigrams_filtered <- bigrams_separated %>%
# filter(!word1 %in% stop_words$word) %>%
# filter(!word2 %in% stop_words$word)
#
# # new bigram counts:
# bigram_counts <- bigrams_filtered %>%
# count(word1, word2, sort = TRUE)
#
# bigram_counts
#
#
# bigrams_united <- bigrams_filtered %>%
# unite(bigram, word1, word2, sep = " ")
#
# bigrams_united
#
# bigram_tf_idf <- bigrams_united %>%
# count(book, bigram) %>%
# bind_tf_idf(bigram, book, n) %>%
# arrange(desc(tf_idf))
#
# bigram_tf_idf
#
#