Ukrainian Honeypot ::003:: Russian Established Sessions Map

Last Updated

Sun Sep 4 00:04:53 2022

Packages

library(Rwhois)
library(ggplot2)
library(sf)

## Linking to GEOS 3.9.2, GDAL 3.3.3, PROJ 8.2.1; sf_use_s2() is TRUE

library(wk)

Load Data

https://mydata.biz/ru/catalog/databases/borders_ru

# Semicolon-delimited region table. Later steps read its `name__en` column
# (to build the merge key) and its `WKT` column (as the map geometry).
russia <- read.csv(
    file="redacted/geomaps/ru/admin_level_4.csv",
    sep=";"
)

http://www.diva-gis.org/gdata

# Administrative-unit table. Its `NAME_1` column is passed to find_province()
# as the set of province names to look for.
rus_adm3 <- read.csv(
    file="redacted/geomaps/ru/RUS_adm3.csv"
)

https://gadm.org/data.html

#!/bin/bash

# Build russian_hosts.csv: the header row of the dionaea connections file,
# followed by every line containing "RU" from the cowrie and dionaea geo files.
# NOTE(review): the match is unanchored, so "RU" anywhere on a line counts, and
# cowrie rows are appended under the dionaea header -- confirm both are intended.
{
    head -n 1 ../graphs/dionaea_connections_geo.csv
    grep -h RU \
        ../graphs/cowrie_sessions_geo.csv \
        ../graphs/dionaea_connections_geo.csv
} > russian_hosts.csv

# Host rows written to russian_hosts.csv by the shell step. Later steps read
# the `remote_host` column (WHOIS lookups) and the `count` column (aggregation).
russian_hosts <- read.csv(
    file="russian_hosts.csv"
)

Province Mappings

(Province? Oblast? Okrug? Ugra? Krai? I'm just going with Province because I'm a dumbass)

These rules are fed into the functions below for extra parsing and cleaning, and to supply analogous province names that won't be found in rus_adm3$NAME_1. They are used for merging with admin_level_4.csv's russia$name__en, and are also mixed and matched when searching WHOIS records to get as many matches as possible. Kludgy, but it works. It is unclear how many false positives (e.g. a street name inside some city that happens to match a province elsewhere) and false negatives (misspellings, things I didn't know about or catch, WHOIS records that are incorrect or incomplete) there are.

# Ordered cleanup rules, one c(pattern, replacement) pair per entry.
# Presumably applied in sequence by cleanup_province()/find_province(), so the
# order matters: the "[^a-z]" rule removes spaces and punctuation, which is why
# the later patterns ("cityof", "saintpetersburg", ...) are written without them.
ru_map_cleanup_list <- local({
    # One rule per row: column 1 is the pattern, column 2 the replacement.
    rule_table <- matrix(
        c(
            " (oblast|krai|republic|autonomous okrug|ugra)", "",
            "(autonomous|republic) of ", "",
            "[^a-z]", "",
            "cityof|city", "",
            "saintpetersburg", "petersburg",
            "stpetersburg", "petersburg",
            "gornoaltay", "altay",
            "gornoaltai", "altay"
        ),
        ncol=2, byrow=TRUE
    )
    lapply(seq_len(nrow(rule_table)), function(i) rule_table[i, ])
})

# Name translations for the map side of the merge: each entry is
# c(cleaned russia$name__en spelling, cleaned rus_adm3$NAME_1 spelling).
# The right-hand values must equal the keys produced for the host data,
# otherwise the region fails to join and is drawn with no count.
ru_mirror_map <- list(
    c("adygea", "adygey"),
    c("altai", "altay"),
    c("buryatia", "buryat"),
    c("chukotka", "chukot"),
    c("chuvashia", "chuvash"),
    # "ingushetia" is the conventional English spelling; the original
    # "inguishetia" entry is kept in case the source data really uses it.
    c("inguishetia", "ingush"),
    c("ingushetia", "ingush"),
    c("jewishautonomous", "yevrey"),
    c("kabardinobalkaria", "kabardinbalkar"),
    c("kalmykia", "kalmyk"),
    c("karachaycherkessia", "karachaycherkess"),
    c("khantymansiysk", "khantymansiy"),
    c("khakassia", "khakass"),
    # NOTE(review): "magaburyatdan" looks like a typo but appears deliberate
    # (matching a quirk in the NAME_1 values) -- confirm against rus_adm3$NAME_1.
    c("magadan", "magaburyatdan"),
    c("mariel", "mariyel"),
    c("northossetiaalania", "northossetia"),
    c("nizhnynovgorod", "nizhegorod"),
    c("oryol", "orel"),
    c("primorsky", "primorye"),
    # Was "udmert": the host-side key is "udmurt", so Udmurtia never merged.
    c("udmurtia", "udmurt"),
    c("yamalonenets", "yamalnenets"),
    c("zabaykalsky", "zabaykalye")
)

# Name translations for the host (WHOIS) side of the merge: every listed
# alias is mapped to the single key "moscow".
ru_mirror_geo <- lapply(
    c("moskva", "moscowcity"),
    function(alias) c(alias, "moscow")
)

Functions

The code below searches the WHOIS results, smashed together, for the first occurrence of any known province name. This seems to be the best approach since the address format is variable, although a few loose conventions do show up in the results, such as [PROVINCE], [CITY], Russia and Russia, [PROVINCE], [CITY].

https://bcable.net/x/Rproj/shared

source("shared/geo_provinces.R")

WHOIS Cache Update

Just mass WHOIS + cache with ability to individually update without rerunning everything.

russian_hosts_geo.csv

# Build or extend the WHOIS-derived province cache for the Russian hosts.
#
# - No cache yet: query WHOIS for every host and write russian_hosts_geo.csv.
# - Cache present: load it; unless russian_hosts_geo_new.csv already exists,
#   query only the hosts missing from the cache and write the combined table
#   to russian_hosts_geo_new.csv (russian_hosts_geo.csv itself is not touched).
#
# NOTE(review): when russian_hosts_geo_new.csv already exists it is not read
# here, so the analysis keeps using russian_hosts_geo.csv -- presumably the new
# file is promoted by hand; confirm.

# Map each WHOIS result (using its `val` column) to a province name.
# sapply() is kept because find_province()'s return shape is defined in
# shared/geo_provinces.R and is not checked here.
whois_to_province <- function(whois_results){
    sapply(
        whois_results, FUN=function(x){ find_province(
            x$val,
            rus_adm3$NAME_1, ru_map_cleanup_list, ru_mirror_map, ru_mirror_geo
        ) }
    )
}

if(!file.exists("russian_hosts_geo.csv")){
    russian_hosts_whois <- Rwhois::whois_query(russian_hosts$remote_host)
    ret_provinces <- whois_to_province(russian_hosts_whois)
    russian_hosts_geo <- russian_hosts
    russian_hosts_geo$province <- ret_provinces
    write.csv(russian_hosts_geo, "russian_hosts_geo.csv", row.names=FALSE)

} else {
    russian_hosts_geo <- read.csv("russian_hosts_geo.csv")

    if(!file.exists("russian_hosts_geo_new.csv")){
        russian_hosts_new <- russian_hosts[
            !(russian_hosts$remote_host %in% russian_hosts_geo$remote_host),
        ]

        # With nothing new to look up, skip the WHOIS round-trip and do not
        # write an update file that would only duplicate the cache.
        if(nrow(russian_hosts_new) > 0){
            russian_hosts_new_whois <- Rwhois::whois_query(
                russian_hosts_new$remote_host
            )
            ret_provinces <- whois_to_province(russian_hosts_new_whois)
            russian_hosts_new_geo <- russian_hosts_new
            russian_hosts_new_geo$province <- ret_provinces

            # rbind() on data frames requires identical column sets. The cache
            # and the fresh rows can drift apart, which previously aborted with
            # "numbers of columns of arguments do not match". Fill whichever
            # side lacks a column with NA, and say so.
            only_in_cache <- setdiff(
                names(russian_hosts_geo), names(russian_hosts_new_geo)
            )
            only_in_new <- setdiff(
                names(russian_hosts_new_geo), names(russian_hosts_geo)
            )
            if(length(only_in_cache) > 0 || length(only_in_new) > 0){
                warning(
                    "russian_hosts_geo.csv and the new host rows have ",
                    "different columns; filling with NA: ",
                    paste(c(only_in_cache, only_in_new), collapse=", "),
                    call.=FALSE
                )
            }
            for(geo_col in only_in_cache){
                russian_hosts_new_geo[[geo_col]] <- rep(
                    NA, nrow(russian_hosts_new_geo)
                )
            }
            for(geo_col in only_in_new){
                russian_hosts_geo[[geo_col]] <- rep(
                    NA, nrow(russian_hosts_geo)
                )
            }

            russian_hosts_geo <- rbind(
                russian_hosts_geo,
                russian_hosts_new_geo[, names(russian_hosts_geo), drop=FALSE]
            )

            write.csv(
                russian_hosts_geo, "russian_hosts_geo_new.csv",
                row.names=FALSE
            )
        }
    }
}

## Error in rbind(deparse.level, ...): numbers of columns of arguments do not match

Clean Map Data

# Give both tables a normalized `merge.col` key so they can be joined.
# Host provinces use the ru_mirror_geo aliases; map region names use
# ru_mirror_map. Both share ru_map_cleanup_list.
russian_hosts_geo[["merge.col"]] <- cleanup_province(
    russian_hosts_geo$province,
    ru_map_cleanup_list,
    ru_mirror_geo
)

russia[["merge.col"]] <- cleanup_province(
    russia$name__en,
    ru_map_cleanup_list,
    ru_mirror_map
)

Bad map, time to get the belt. Crimea and Sevastopol are in Ukraine:

# Drop map rows whose key is empty or is Crimea/Sevastopol.
russia <- russia[
    !is.element(russia$merge.col, c("", "crimea", "sevastopol")),
]

Aggregate Provinces

# Total the `count` column per normalized province key, then show the table.
agg_provinces <- aggregate(
    count ~ merge.col,
    data=russian_hosts_geo,
    FUN=sum
)

agg_provinces

##        merge.col count
## 1          altay    66
## 2           amur   168
## 3    arkhangelsk  4201
## 4      astrakhan    17
## 5  bashkortostan    11
## 6       belgorod   527
## 7        bryansk  1008
## 8         buryat   207
## 9    chelyabinsk  3992
## 10       chuvash   341
## 11      dagestan   112
## 12        ingush     8
## 13       irkutsk   641
## 14       ivanovo    39
## 15   kaliningrad   141
## 16        kalmyk     4
## 17        kaluga    99
## 18     kamchatka    42
## 19       karelia     5
## 20      kemerovo   349
## 21    khabarovsk   150
## 22       khakass    18
## 23  khantymansiy    30
## 24         kirov   276
## 25          komi  2284
## 26      kostroma  1075
## 27     krasnodar   742
## 28   krasnoyarsk   313
## 29        kurgan     2
## 30         kursk    28
## 31     leningrad   107
## 32       lipetsk    82
## 33      mordovia    64
## 34        moscow 11674
## 35      murmansk    12
## 36    nizhegorod    18
## 37      novgorod  4006
## 38   novosibirsk  1080
## 39          omsk   278
## 40          orel   195
## 41      orenburg   225
## 42         penza   105
## 43          perm  2684
## 44    petersburg  1793
## 45      primorye    18
## 46         pskov     9
## 47        rostov  2079
## 48        ryazan    95
## 49         sakha    46
## 50      sakhalin   539
## 51        samara   811
## 52       saratov  1345
## 53      smolensk    97
## 54     stavropol  1429
## 55    sverdlovsk    69
## 56        tambov     4
## 57     tatarstan   149
## 58         tomsk   202
## 59          tula    22
## 60          tuva     4
## 61          tver   147
## 62        tyumen   346
## 63        udmurt    10
## 64     ulyanovsk   113
## 65      vladimir  3906
## 66     volgograd   153
## 67       vologda   162
## 68      voronezh   966
## 69     yaroslavl    25

Merge Map & Data

# Left join: keep every map region, attaching its session count where one
# exists (regions with no matching key get NA).
russia_data <- merge(
    x=russia,
    y=agg_provinces,
    by="merge.col",
    all.x=TRUE
)

Map

# Parse the WKT column into WGS84 simple features, then shift longitudes
# (st_shift_longitude) so the x-limits below can run past 180.
sf_russia <- st_shift_longitude(
    st_as_sf(russia_data, wkt="WKT", crs="WGS84")
)

# Choropleth of session counts per region, built as a single layer chain.
g <- ggplot(sf_russia) +
    labs(
        title="CO.UA Honeypot: Established Sessions by Russian Region (From WHOIS Data)",
        fill="Sessions"
    ) +
    scale_fill_viridis_c() +
    geom_sf(aes(geometry=WKT, fill=count)) +
    theme_bw() +
    coord_sf(xlim=c(20,190)) +
    theme(
        plot.margin = margin(0.2, 0.2, 0.2, 0.2, "cm")
    )
g

plot of chunk map_russia_sf

Boilerplate GeoIP Disclaimer

Geolocation based on IP address is not to be taken as entirely accurate as to the source of traffic or attacks conducted. There are many reasons for this, which include (but are not limited to):

Proxies, VPNs, and Tor

Large quantities of traffic, especially attack based traffic, will use a VPN or the Tor network (or some reasonable facsimile), to mask the origin of the traffic. This will in turn change the appearance of the location of origin. Usually, an attacker will also intentionally want the traffic to appear to come from somewhere that has some form of lesser legal jurisdiction, some form of lesser ability to police traffic, or come from a well known source of malicious attacks such as China or Russia.

For instance, the following log entry was generated by myself against my servers while sitting at my desk in the United States, but it gets geolocated as Russia because of how the packet was sent. This sort of masking is trivial to perform, even by a nine year old on a cellphone.

# Show the request, response code and geolocated country for the demo entry.
httpd_data[
    grepl("/from/russia/with/logs", httpd_data$Request),
    c("Request", "Response.Code", "Country.Code")
]

##                               Request Response.Code Country.Code
## 1 GET /from/russia/with/logs HTTP/1.1           404           RU

Vulnerable Servers and Botnets

Some locations will have a higher distribution of virtual servers than others, such as Silicon Valley or China. This can lead to larger quantities of vulnerable virtual machines and servers in those regions, and distort the resulting aggregate data.

Government Interference

It is possible that due to address assignment for governmental intelligence purposes or other economic or political reasons a nation could re-allocate address space and forge the identity similarly to a NAT (network address translation). They could also funnel information via VPN technologies for another nation.

Because most of these agreements are made in private, and due to the fact that most geolocation, RDAP, and WHOIS records are based on self-reporting, it is impossible to know the 100% true nature of geographic address assignment.

Weaknesses or errors in MaxMind, rgeolocate, RDAP, or WHOIS

This geolocation uses the rgeolocate package available in CRAN, and uses the internal country database that is shipped with it. There could be an error in the database shipped, there could be an error in the lookup code, etc. Bugs happen. I have no reason to believe that any false geolocation is being performed by these packages, however.

Also used are the self-reported RDAP and WHOIS systems, whose records can frequently be false or misleading. Which of the systems (RDAP, WHOIS, or rgeolocate) is used is disclosed when necessary.

Final Note

Despite these weaknesses, this doesn't change the fact that looking at this sort of data can be quite fun and interesting, and potentially enlightening. Generalized conclusions should not be made from this data or the maps herein. You have been warned.