Suppose you want to calculate personalized PageRanks for some graph
that is not supported by aPPR
. You can extend
aPPR
to arbitrary graph objects, so long the graph object
is an S3 object with methods:
check()
node_degrees()
neighborhood()
-
appr()
(optional)
See the documentation for those S3 generics to understand the generic specification fully!
We demonstrate how to implement these methods below for a new, custom
graph object. In this case, we will consider the PubMed citation
network, which we will interact via the PubMed API, using the
rentrez
package. First, we define a constructor function
that returns a custom S3 graph object that subclasses
abstract_graph
. You can install rentrez with
pak::pak("ropensci/rentrez")
, and do not need to set up any
authentication to begin using the API.
library(aPPR)
library(logger)
library(glue)
library(rentrez)
# constructor for PubMed graph object (defined over API)
pubmed_graph <- function(max_attempts = 3) {
if (!requireNamespace("rentrez", quietly = TRUE)) {
stop(
"`rentrez` package must be installed to use `pubmed_graph()`",
call. = FALSE
)
}
agraph <- abstract_graph(
subclass = "pubmed",
max_attempts = max_attempts
)
agraph
}
graph <- pubmed_graph()
graph
#> Abstract graph object (subclass: pubmed)
Now we want to implement S3 methods for the pubmed
object. In some cases, you can query data from a graph in large batches,
but with the PubMed API it simpler (at least to my limited knowledge) to
query node by node, with no bulk lookups. It turns out that we can get
neighborhoods and node in-degree and node out-degree all at once, using
rentrez::entrez_link()
. We give the function three attempts
(by default) to successfully complete this API call, since APIs
sometimes fail. Then, since we will need this information repeatedly, we
memoize the function, to avoid repeated calls to the API.
# one node at a time
get_pubmed_data <- function(graph, node) {
for (i in 1:graph$max_attempts) {
log_trace(
glue("Attempt {i}/{graph$max_attempts} to get node degrees: {node}")
)
tryCatch(
{
cites <- entrez_link(dbfrom = "pubmed", db = "all", id = node)
break
},
error = function(cnd) {
if (i == graph$max_attempts) {
log_debug(
glue("Maximum attempts to find neighborhood met, could not find: {node}")
)
stop("Couldn't pull data for node")
}
}
)
}
data <- list(
refs = unique(cites$links$pubmed_pubmed_refs),
citedby = unique(cites$links$pubmed_pubmed_citedin)
)
data$num_refs <- length(data$refs)
data$num_citedby <- length(data$citedby)
data
}
memo_get_pubmed_data <- memoise::memoise(get_pubmed_data)
Now we test the function. I’m not currently sure that it’s working: it’s suspicious for two papers to have in-degree and out-degree all equal to 18 – we need to dig into this and find out if the API is limited to returning a maximum of API results in a single call, for example.
good_node_ids <- c("30345262", "29624432", "29867837")
bad_node_id <- "I am a pumpkin"
mixed_node_ids <- c(good_node_ids, bad_node_id)
# this is suspicious to me, something seems wrong here
memo_get_pubmed_data(graph, good_node_ids[1])
#> $refs
#> [1] "28956619" "27755345" "24936257" "24059552" "23149807" "22371471"
#> [7] "22128211" "21548822" "20840052" "20836729" "20689101" "20036928"
#> [13] "19879747" "19783003" "19718579" "18792926" "18477059" "18256996"
#> [19] "17943776" "17920516" "17499247" "17127265" "16601269" "16556615"
#> [25] "16061595" "15687416" "15665383" "15499145" "15177673" "14725946"
#> [31] "12667636" "12496220" "11150646" "11041438" "10386225" "10225288"
#> [37] "9768331" "9691157" "9227580" "9145435" "9131263" "8525349"
#> [43] "7999032" "7733322" "7720662" "7603298" "6607683" "3029864"
#>
#> $citedby
#> [1] "35455075" "33190629" "32785463" "31979022"
#>
#> $num_refs
#> [1] 48
#>
#> $num_citedby
#> [1] 4
# suspicious that the number of in-cites and out-cites matches, and that it
# matches across both papers! TODO: investigate!
memo_get_pubmed_data(graph, good_node_ids[2])
#> $refs
#> NULL
#>
#> $citedby
#> [1] "38693465" "38284617" "36892193" "36293784" "36111739" "35799170"
#> [7] "35757994" "35565846" "34211314" "34205900" "33976584" "33210456"
#> [13] "32911625" "31835736" "31236026" "31137691" "30882237" "30592165"
#> [19] "30234121" "26061694" "15888210"
#>
#> $num_refs
#> [1] 0
#>
#> $num_citedby
#> [1] 21
memo_get_pubmed_data(graph, good_node_ids[3])
#> $refs
#> [1] "28418291" "28159212" "28135118" "28103607" "28088062" "27936476"
#> [7] "27889615" "27831549" "27755345" "27684560" "27566870" "27490567"
#> [13] "26903304" "26741949" "26334428" "26238662" "26190380" "26116636"
#> [19] "26086569" "25794697" "25769062" "25728542" "25313944" "24767058"
#> [25] "24752468" "24376898" "24200019" "24169308" "24084373" "23805943"
#> [31] "23769834" "23713111" "23274339" "23271066" "23178065" "23178060"
#> [37] "22371471" "22245400" "21921811" "21638200" "21622093" "21247481"
#> [43] "20946146" "20509779" "19832716" "19516191" "19414822" "19127217"
#> [49] "18845249" "18477059" "18422617" "18419467" "17868474" "17661617"
#> [55] "17517714" "16266685" "15886342" "14657823" "14633102" "12732956"
#> [61] "11740876" "10791723" "10432587" "8489717" "8274003" "6323663"
#> [67] "318682"
#>
#> $citedby
#> [1] "38613123" "38455872" "38129684" "37060103" "36330365" "36193380"
#> [7] "35874591" "35454995" "35127595" "35124754" "33623528" "33546253"
#> [13] "33276234" "31803156" "31137691" "30873395" "30770541" "30560111"
#> [19] "30443539" "30356694" "30319659"
#>
#> $num_refs
#> [1] 67
#>
#> $num_citedby
#> [1] 21
# check that we handle bad node ids in some reliable way, in this case
# it looks we get empty results
memo_get_pubmed_data(graph, bad_node_id)
#> $refs
#> NULL
#>
#> $citedby
#> NULL
#>
#> $num_refs
#> [1] 0
#>
#> $num_citedby
#> [1] 0
#' Check method for `pubmed` graph objects
#'
#' @param graph A `pubmed` graph object
#' @param nodes A **character** vector of node ids. **Can be empty!**
#'
#' @return A **character** vector of node ids that we can reach in the graph.
#' For example, some nodes ids may not be reachable due to API failures,
#' or, more generally, permissions failures.
#'
#' If `nodes` is the empty vector, returns the empty vector. Be sure to
#' handle this edge case.
check.pubmed <- function(graph, nodes) {
log_debug(glue("Checking nodes"))
# handle the case where no nodes are passed
if (length(nodes) < 1) {
return(character(0))
}
good_nodes <- character(0)
for (node in nodes) {
node_data <- memo_get_pubmed_data(graph, node)
# this is a sufficient check to see if (1) the node is in pubmed, (2)
# we can pull it's neighborhood, and (3) it has at least one
# incoming or outgoing citation
if (node_data$num_refs + node_data$num_citedby > 0) {
log_trace(glue("Checked node: {node} (good)"))
good_nodes <- c(good_nodes, node)
next
}
log_trace(glue("Checked node: {node} (bad)"))
}
good_nodes
}
Now we test our implementation. To do this, we should give at least one good node id, and at least one bad node id. Only the good node id should be returned.
check(graph, good_node_ids)
#> [1] "30345262" "29624432" "29867837"
check(graph, bad_node_id)
#> character(0)
check(graph, mixed_node_ids)
#> [1] "30345262" "29624432" "29867837"
#' Degree method for `pubmed` graph objects
#'
#' @param graph A `pubmed` graph object
#' @param nodes A **character** vector of node ids. **Cannot be empty.** Should
#' not contain duplicates if `check()` is properly implemented and does
#' not output duplicates.
#'
#' @return A list, with two elements, `in_degree` and `out_degree`. Both
#' should be the same length as `nodes`, and match the order of `nodes`.
#'
node_degrees.pubmed <- function(graph, nodes) {
log_debug(glue("Getting node degrees"))
degrees <- list(
in_degree = integer(length(nodes)),
out_degree = integer(length(nodes))
)
for (i in seq_along(nodes)) {
log_debug(glue("Getting node degrees for node: {nodes[i]}"))
node_data <- memo_get_pubmed_data(graph, nodes[i])
# must treat pubmed like an undirected graph. otherwise it's a citation
# network, and thus a tree, and thus no pair of nodes is mutually
# reachable, thus pagerank is not defined
degrees$in_degree[i] <- node_data$num_citedby + node_data$num_refs
degrees$out_degree[i] <- node_data$num_citedby + node_data$num_refs
log_trace(glue("In-degree for node: {degrees$in_degree[i]}"))
log_trace(glue("Out-degree for node: {degrees$out_degree[i]}"))
}
log_debug(glue("Done getting node degrees"))
degrees
}
To test this method, we should pass a character vector of several good node ids.
# test with a single node
node_degrees(graph, good_node_ids[1])
#> $in_degree
#> [1] 52
#>
#> $out_degree
#> [1] 52
# test with multiple nodes! this is the key one! this is suspicious, and
# means we need to check if our function memo_get_pubmed_data() is working
node_degrees(graph, good_node_ids)
#> $in_degree
#> [1] 52 21 88
#>
#> $out_degree
#> [1] 52 21 88
#' Neighborhood method for `pubmed` graph objects
#'
#' @param graph A `pubmed` graph object
#' @param nodes A length one character vector, for a node in the graph with
#' at least one outgoing edge.
#'
#' @return A **character** vector of node ids for the graph neighborhood.
#' Should be a vector of length at least one (if the `check()` method was
#' implemented correctly), and should not contain duplicates.
neighborhood.pubmed <- function(graph, node) {
if (length(node) > 1) {
stop("`node` must be a character vector of length one.")
}
log_debug(glue("Getting neighborhood: {node}"))
node_data <- memo_get_pubmed_data(graph, node)
log_debug(glue("Done getting neighborhood: {node}"))
unique(node_data$refs, node_data$citedby)
}
neighborhood(graph, good_node_ids[1])
#> [1] "28956619" "27755345" "24936257" "24059552" "23149807" "22371471"
#> [7] "22128211" "21548822" "20840052" "20836729" "20689101" "20036928"
#> [13] "19879747" "19783003" "19718579" "18792926" "18477059" "18256996"
#> [19] "17943776" "17920516" "17499247" "17127265" "16601269" "16556615"
#> [25] "16061595" "15687416" "15665383" "15499145" "15177673" "14725946"
#> [31] "12667636" "12496220" "11150646" "11041438" "10386225" "10225288"
#> [37] "9768331" "9691157" "9227580" "9145435" "9131263" "8525349"
#> [43] "7999032" "7733322" "7720662" "7603298" "6607683" "3029864"
neighborhood(graph, good_node_ids[2])
#> NULL
neighborhood(graph, good_node_ids[3])
#> [1] "28418291" "28159212" "28135118" "28103607" "28088062" "27936476"
#> [7] "27889615" "27831549" "27755345" "27684560" "27566870" "27490567"
#> [13] "26903304" "26741949" "26334428" "26238662" "26190380" "26116636"
#> [19] "26086569" "25794697" "25769062" "25728542" "25313944" "24767058"
#> [25] "24752468" "24376898" "24200019" "24169308" "24084373" "23805943"
#> [31] "23769834" "23713111" "23274339" "23271066" "23178065" "23178060"
#> [37] "22371471" "22245400" "21921811" "21638200" "21622093" "21247481"
#> [43] "20946146" "20509779" "19832716" "19516191" "19414822" "19127217"
#> [49] "18845249" "18477059" "18422617" "18419467" "17868474" "17661617"
#> [55] "17517714" "16266685" "15886342" "14657823" "14633102" "12732956"
#> [61] "11740876" "10791723" "10432587" "8489717" "8274003" "6323663"
#> [67] "318682"
Lastly, you can optionally implement an appr
method for
your abstract graph subclass. In the appr
method for the
subclass, you can do things like:
- Add functionality to convert a convenient seed node name (in this,
possibly something like a DOI) into the internal node name
representation (see
appr.rtweet_graph()
for an example of this) - Checks that you have appropriate authorization to pull information about the seed nodes
- Etc, etc
This custom subclass method will run before the general
appr.abstract_graph()
. We don’t have a particular need to
do anything of that here, so we do not.
Debugging
If you are accessing a graph over an API, it’s likely that you will
encounter edge cases where the API returns no data, or data in a format
that you did not expect. We highly recommend using logging to debug your
implementation when this happens, using the logger
library.
See that logger
documentation for details.
Find any errors, fix, and rinse and repeat until you’ve completed the likely unpleasant task of tracking down all the edges cases in the API. In our, we don’t seem to find any edge cases right away.
library(logger)
# set logging threshold for code you just wrote, if desired
log_threshold(TRACE)
# set logging threshold for aPPR package functions, if desired
log_threshold(DEBUG, namespace = "aPPR")
appr(
graph, # the graph to work with
seeds = good_node_ids[1], # name of seed node (character)
epsilon = 0.0005, # desired approximation quality
max_visits = 10 # bound computation since this is an example
)
#> Warning: Maximum visits reached. Finishing aPPR calculation early.
#> Personalized PageRank Approximator
#> ----------------------------------
#>
#> - number of seeds: 1
#> - unique nodes visited so far: 10 out of maximum of 10
#> - total visits so far: 11
#> - bad nodes so far: 0
#>
#> - teleportation constant (alpha): 0.15
#> - desired approximation error (epsilon): 5e-04
#> - achieved bound on approximation error: 0.00883575883575884
#> - length of to visit list: 19
#>
#> PPR table (see $stats field):
#> # A tibble: 49 × 7
#> name regularized p in_degree out_degree degree_adjusted r
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 30345262 0.000517 0.0811 52 52 0.00156 0.459
#> 2 28956619 0 0 1 1 0 0.00884
#> 3 27755345 0 0 33 33 0 0.00884
#> 4 24936257 0 0 88 88 0 0.00884
#> 5 24059552 0 0 15 15 0 0.00884
#> 6 23149807 0.00000602 0.000716 14 14 0.0000512 0.00406
#> 7 22371471 0 0 1321 1321 0 0.00884
#> 8 22128211 0 0 33 33 0 0.00884
#> 9 21548822 0 0 20 20 0 0.00884
#> 10 20840052 0 0 53 53 0 0.00884
#> # ℹ 39 more rows