Ukrainian Honeypot ::004:: Chinese Established Sessions Map

See Also

https://bcable.net/analysis-ukr-prelim.html

https://bcable.net/analysis-ukr-graphs.html

https://bcable.net/analysis-ukr-indicators.html

https://bcable.net/analysis-ukr-ru_map_sessions.html

Last Updated

Fri May 13 19:58:23 2022

Packages

library(Rwhois)
library(ggplot2)
library(sf)
## Linking to GEOS 3.9.2, GDAL 3.3.3, PROJ 8.2.1; sf_use_s2() is TRUE
library(wk)

Load Data

https://gadm.org/data.html

# Province boundaries for China from the GADM shapefile; its NAME_1 column is
# used further down as the province name when building the merge key.
china <- sf::read_sf(dsn = "redacted/geomaps/cn/gadm40_CHN_1.shp")

https://www.downloadexcelfiles.com/cn_en/download-excel-file-list-provinces-china

# Honeypot hosts; the remote_host column drives the WHOIS lookups below and
# the count column is summed per province later on.
chinese_hosts <- read.csv(file = "chinese_hosts.csv")

# Downloaded list of provinces; rows whose Province field is the empty string
# are dropped so that only real names are handed to find_province().
provinces_list <- read.csv(
    file = "redacted/geomaps/cn/list_of_provinces_of_china-70j.csv"
)
province_names <- with(provinces_list, Province[Province != ""])

Province Mappings

# No extra mapping is supplied for the host/WHOIS side: this NULL is passed as
# the last argument to find_province() and to cleanup_province() for the hosts.
cn_mirror_geo <- NULL

# Two-element character vectors, presumably (regex pattern, replacement) pairs
# applied in order by the shared helpers -- confirm in shared/geo_provinces.R.
cn_map_cleanup_list <- list(
    c(
        " (municipality|province|autonomous region|special administrative region)",
        ""
    ),
    c("[^a-z]", "")
)

# Pairs of alternative spellings of the same region, presumably used to
# reconcile the province list with the shapefile's NAME_1 values -- confirm.
# NOTE(review): "xinjianuygur" may be missing a "g" ("xinjianguygur"); verify
# against the cleaned NAME_1 value for Xinjiang before changing it.
cn_mirror_map <- list(
    c("guangxi", "guangxizhuang"),
    c("ningxia", "ningxiahui"),
    c("innermongolia", "neimongol"),
    c("xinjiang", "xinjianuygur")
)

Functions

https://bcable.net/x/Rproj/shared

# Loads the shared helpers. find_province() and cleanup_province() are called
# below but not defined in this file, so they are presumably defined there --
# confirm against shared/geo_provinces.R.
source("shared/geo_provinces.R")

WHOIS Cache Update

chinese_hosts_geo.csv

# Resolve hosts to provinces through WHOIS.
#
# hosts: vector of host addresses (the remote_host column of the host data).
# Returns whatever sapply() builds from one find_province() result per WHOIS
# response, or character(0) when there is nothing to look up.
# NOTE(review): sapply() only yields a plain vector if find_province() always
# returns a single value -- confirm in shared/geo_provinces.R.
whois_provinces <- function(hosts){
    # Never call whois_query() with no hostnames: the rendered run of this
    # chunk failed with "Error in write.socket(conn, hostname): ... got 'NULL'",
    # which is consistent with a lookup being attempted without a hostname.
    if(length(hosts) == 0){
        return(character(0))
    }

    whois_results <- Rwhois::whois_query(hosts)
    sapply(
        whois_results, FUN=function(x){ find_province(
            x$val,
            province_names, cn_map_cleanup_list, cn_mirror_map, cn_mirror_geo
        ) }
    )
}

if(!file.exists("chinese_hosts_geo.csv")){
    # No cache yet: look up every host and create the cache file.
    chinese_hosts_geo <- chinese_hosts
    chinese_hosts_geo$province <- whois_provinces(chinese_hosts$remote_host)
    write.csv(chinese_hosts_geo, "chinese_hosts_geo.csv", row.names=FALSE)

} else {
    chinese_hosts_geo <- read.csv("chinese_hosts_geo.csv")

    # An existing chinese_hosts_geo_new.csv suppresses further lookups.
    # NOTE(review): that file is only ever written, never read back here, so
    # it appears to be a staging file that is promoted by hand -- confirm.
    if(!file.exists("chinese_hosts_geo_new.csv")){
        # Only hosts that are missing from the cache need a WHOIS query.
        chinese_hosts_new <- chinese_hosts[
            !(chinese_hosts$remote_host %in% chinese_hosts_geo$remote_host),
        ]

        # With nothing new there is nothing to query, append or write.
        if(nrow(chinese_hosts_new) > 0){
            chinese_hosts_new_geo <- chinese_hosts_new
            chinese_hosts_new_geo$province <- whois_provinces(
                chinese_hosts_new$remote_host
            )

            chinese_hosts_geo <- rbind(
                chinese_hosts_geo, chinese_hosts_new_geo
            )

            write.csv(
                chinese_hosts_geo, "chinese_hosts_geo_new.csv",
                row.names=FALSE
            )
        }
    }
}
## Error in write.socket(conn, hostname): 'translateChar' must be called on a CHARSXP, but got 'NULL'

Clean Map Data

# Build the shared join key (merge.col) on both tables with the same cleanup
# rules. The shapefile side additionally receives cn_mirror_map, while the
# host side receives cn_mirror_geo (NULL).
china[["merge.col"]] <- cleanup_province(
    china[["NAME_1"]], cn_map_cleanup_list, cn_mirror_map
)
chinese_hosts_geo[["merge.col"]] <- cleanup_province(
    chinese_hosts_geo[["province"]], cn_map_cleanup_list, cn_mirror_geo
)

Aggregate Provinces

# Total the session counts for every cleaned province key, then show the table.
agg_provinces <- aggregate(
    count ~ merge.col,
    data = chinese_hosts_geo,
    FUN = sum
)
print(agg_provinces)
##        merge.col count
## 1          anhui  3446
## 2        beijing 60407
## 3      chongqing   378
## 4         fujian  1362
## 5          gansu   253
## 6      guangdong 12985
## 7  guangxizhuang   651
## 8        guizhou   512
## 9         hainan   234
## 10         hebei  2644
## 11  heilongjiang   805
## 12         henan 14162
## 13      hongkong   960
## 14         hubei 32372
## 15         hunan   868
## 16 innermongolia   170
## 17       jiangsu 12634
## 18       jiangxi   848
## 19         jilin   443
## 20      liaoning  1378
## 21    ningxiahui    69
## 22       qinghai   137
## 23       shaanxi   407
## 24      shandong 11142
## 25      shanghai  6281
## 26        shanxi  2951
## 27       sichuan  3589
## 28        taiwan    25
## 29       tianjin   390
## 30         tibet    68
## 31        yunnan   345
## 32      zhejiang 27637

(Another bad map: Taiwan is not in China, it's independent… oh well, the actual shapefiles don't include Taiwan, so it doesn't matter, as it doesn't show up.)

Merge Map & Data

# Left join: every province shape is kept (all.x), so provinces without any
# aggregated sessions end up with a missing count.
china_data <- merge(
    x = china,
    y = agg_provinces,
    by = "merge.col",
    all.x = TRUE
)

Map

# Choropleth of established sessions per province, built as a single chain of
# layers in the same order as before, then printed.
g <- ggplot(china_data) +
    labs(
        title="CO.UA Honeypot: Established Sessions by Chinese Region (From WHOIS Data)",
        fill="Sessions"
    ) +
    scale_fill_viridis_c() +
    geom_sf(aes(geometry=geometry, fill=count)) +
    theme_bw() +
    theme(plot.margin = margin(0.2, 0.2, 0.2, 0.2, "cm"))
g

plot of chunk map_china_sf

Boilerplate GeoIP Disclaimer

Geolocation based on IP address is not to be taken as entirely accurate as to the source of traffic or attacks conducted. There are many reasons for this, which include (but are not limited to):

Proxies, VPNs, and Tor

Large quantities of traffic, especially attack based traffic, will use a VPN or the Tor network (or some reasonable facsimile), to mask the origin of the traffic. This will in turn change the appearance of the location of origin. Usually, an attacker will also intentionally want the traffic to appear to come from somewhere that has some form of lesser legal jurisdiction, some form of lesser ability to police traffic, or come from a well known source of malicious attacks such as China or Russia.

For instance, the following log entry was generated by myself against my servers while sitting at my desk in the United States, but it gets geolocated as Russia because of how the packet was sent. This sort of masking is trivial to perform, even by a nine year old on a cellphone.

# Show the request, response code and geolocated country for the test entry
# (httpd_data is not defined in this document).
httpd_data[
    grep("/from/russia/with/logs", httpd_data$Request),
    c("Request", "Response.Code", "Country.Code")
]

##                               Request Response.Code Country.Code
## 1 GET /from/russia/with/logs HTTP/1.1           404           RU

Vulnerable Servers and Botnets

Some locations will have a higher distribution of virtual servers than others, such as Silicon Valley or China. This can lead to larger quantities of vulnerable virtual machines and servers in those regions, and distort the resulting aggregate data.

Government Interference

It is possible that due to address assignment for governmental intelligence purposes or other economic or political reasons a nation could re-allocate address space and forge the identity similarly to a NAT (network address translation). They could also funnel information via VPN technologies for another nation.

Because most of these agreements are made in private, and due to the fact that most geolocation, RDAP, and WHOIS records are based on self-reporting, it is impossible to know the 100% true nature of geographic address assignment.

Weaknesses or errors in MaxMind, rgeolocate, RDAP, or WHOIS

This geolocation uses the rgeolocate package available in CRAN, and uses the internal country database that is shipped with it. There could be an error in the database shipped, there could be an error in the lookup code, etc. Bugs happen. I have no reason to believe that any false geolocation is being performed by these packages, however.

Also used are the self-reported RDAP or WHOIS systems, whose records can frequently be false or misleading. Which of the systems (RDAP, WHOIS, or rgeolocate) is used is disclosed when necessary.

Final Note

Despite these weaknesses, this doesn't change the fact that looking at this sort of data can be quite fun and interesting, and potentially enlightening. Generalized conclusions should not be made from this data or the maps herein. You have been warned.