Distances on Directed Graphs in R
1#include "graph.h"
2
3
4//' sample_one_edge_no_comps
5//'
6//' Sample one edge for graph that has no pre-calculated components. Only used
7//' in \code{sample_one_vertex}
8//'
9//' @param edge_map edge_map
10//' @return std::vector of 2 elements: [0] with value of largest connected
11//' component; [1] with random index to one edge that is part of that component.
12//' @noRd
13edge_component graph_sample::sample_one_edge_no_comps (vertex_map_t &vertices,
14 edge_map_t &edge_map)
15{
16 // TDOD: FIX edge_id_t type defs here!
17 std::unordered_map <vertex_id_t, size_t> components;
18
19 size_t largest_component =
20 graph::identify_graph_components (vertices, components);
21
22 bool in_largest = false;
23 long int e0 = static_cast <long int> (floor (R::runif (0.0,
24 static_cast <double> (edge_map.size ()) - 1.0)));
25 while (!in_largest)
26 {
27 // TODO: The following is an O(N) lookup; maybe just use
28 // edge_map.find (std::to_string (e0)) and handle not found; that'd be
29 // constant time.
30 edge_t this_edge = std::next (edge_map.begin (), e0++)->second;
31 vertex_id_t this_vert = this_edge.get_from_vertex ();
32 if (components [this_vert] == largest_component)
33 in_largest = true;
34 if (e0 >= static_cast <long int> (edge_map.size ()))
35 e0 = 0; // # nocov
36 }
37
38 edge_component edge_comp;
39 edge_comp.component = largest_component;
40 edge_comp.edge = std::next (edge_map.begin (), e0)->first;
41
42 return edge_comp;
43}
44
45//' sample_one_edge_with_comps
46//'
47//' Sample one edge for graph that has pre-calculated components. Only used in
48//' \code{sample_one_vertex}
49//'
50//' @param edge_map edge_map
51//' @return Random index to one edge that is part of the largest connected
52//' component.
53//' @noRd
54edge_id_t graph_sample::sample_one_edge_with_comps (Rcpp::DataFrame graph,
55 edge_map_t &edge_map)
56{
57 Rcpp::NumericVector component = graph ["component"];
58 std::uniform_int_distribution <size_t> uni (0,
59 static_cast <size_t> (graph.nrow ()) - 1);
60 long int e0 = static_cast <long int> (floor (R::runif (0.0,
61 static_cast <double> (edge_map.size ()) - 1.0)));
62 while (component (static_cast <size_t> (e0)) > 1L) // can't be invoked in tests in a controlled way
63 e0 = static_cast <long int> (floor (R::runif (0.0,
64 static_cast <double> (edge_map.size ()) - 1.0)));
65
66 return std::next (edge_map.begin (), e0)->first;
67}
68
69vertex_id_t graph_sample::select_random_vert (Rcpp::DataFrame graph,
70 edge_map_t &edge_map, vertex_map_t &vertices)
71{
72 vertex_id_t this_vert;
73 if (graph::graph_has_components (graph))
74 {
75 edge_id_t e0 = graph_sample::sample_one_edge_with_comps (graph, edge_map);
76 edge_t this_edge = edge_map.find (e0)->second;
77 this_vert = this_edge.get_from_vertex ();
78 } else
79 {
80 edge_component edge_comp =
81 graph_sample::sample_one_edge_no_comps (vertices, edge_map);
82 edge_t this_edge = edge_map.find (edge_comp.edge)->second; // random edge
83 this_vert = this_edge.get_from_vertex ();
84 }
85
86 return this_vert;
87}
88
89
90//' rcpp_sample_graph
91//'
92//' Randomly sample one connected componnent of a graph
93//'
94//' @param graph graph to be processed
95//' @param nverts_to_sample Number of vertices to sample
96//' @param quiet If TRUE, display progress
97//'
98//' @return Smaller sub-set of \code{graph}
99//'
100//' @noRd
101// [[Rcpp::export]]
102Rcpp::StringVector rcpp_sample_graph (Rcpp::DataFrame graph,
103 size_t nverts_to_sample)
104{
105 vertex_map_t vertices;
106 edge_map_t edge_map;
107 vert2edge_map_t vert2edge_map;
108
109 bool has_times = graph::graph_from_df (graph, vertices, edge_map, vert2edge_map);
110 has_times = false; // rm unused variable warning
111
112 Rcpp::StringVector edges_out;
113 if (vertices.size () <= nverts_to_sample)
114 return edges_out; // return empty vector # nocov
115
116 std::unordered_map <vertex_id_t, size_t> components;
117 size_t largest_component =
118 graph::identify_graph_components (vertices, components);
119 // simple vert_ids set for quicker random selection:
120 std::unordered_set <vertex_id_t> vert_ids;
121 for (auto v: vertices)
122 if (components [v.first] == largest_component)
123 vert_ids.emplace (v.first);
124 if (vert_ids.size () < nverts_to_sample)
125 {
126 Rcpp::Rcout << "Largest connected component only has " << // # nocov
127 vert_ids.size () << " vertices" << std::endl; // # nocov
128 nverts_to_sample = static_cast <size_t> (vert_ids.size ()); // # nocov
129 }
130
131 vertex_id_t this_vert =
132 graph_sample::select_random_vert (graph, edge_map, vertices);
133
134 // Samples are built by randomly tranwing a vertex list, and inspecting
135 // edges that extend from it. The only effective way to randomly sample a
136 // C++ container is with a std::vector, even though that requires a
137 // std::find each time prior to insertion. It's also useful to know the
138 // size, so this vector is **NOT** reserved, even though it easily could be.
139 // Maybe not the best solution?
140 std::vector <vertex_id_t> vertlist;
141 std::unordered_set <edge_id_t> edgelist;
142 vertlist.push_back (this_vert);
143
144
145 size_t count = 0;
146 while (vertlist.size () < nverts_to_sample)
147 {
148 // initialise random int generator:
149 // TODO: Is this quicker to use a single unif and round each time?
150 std::uniform_int_distribution <size_t> uni (0,
151 static_cast <size_t> (vertlist.size ()) - 1);
152 size_t randv = static_cast <size_t> (floor (R::runif (0.0,
153 static_cast <double> (vertlist.size ()) - 1.0)));
154 this_vert = vertlist [randv];
155
156 std::unordered_set <edge_id_t> edges = vert2edge_map [this_vert];
157 for (auto e: edges)
158 {
159 edgelist.insert (e);
160 edge_t this_edge = edge_map.find (e)->second;
161 vertex_id_t vt = this_edge.get_from_vertex ();
162 if (std::find (vertlist.begin(), vertlist.end(), vt) ==
163 vertlist.end())
164 vertlist.push_back (vt);
165 if (vertlist.size () >= nverts_to_sample)
166 {
167 break;
168 } else
169 {
170 vt = this_edge.get_to_vertex ();
171 if (std::find (vertlist.begin(), vertlist.end(), vt) ==
172 vertlist.end())
173 vertlist.push_back (vt);
174 }
175 if (vertlist.size () >= nverts_to_sample)
176 break;
177 }
178 count++;
179 if (count > (100 * nverts_to_sample))
180 {
181 // likely stuck in some one-way part of graph that can't connect, so
182 // reset to another start node
183 // # nocov start
184 edgelist.clear ();
185 this_vert =
186 graph_sample::select_random_vert (graph, edge_map, vertices);
187 vertlist.clear ();
188 vertlist.push_back (this_vert);
189 count = 0;
190 // # nocov end
191 }
192 }
193
194 size_t nedges = static_cast <size_t> (edgelist.size ());
195
196 // edgelist is an unordered set, so has to be iteratively inserted
197 edges_out = Rcpp::StringVector (nedges);
198 size_t i = 0;
199 for (auto e: edgelist)
200 edges_out (i++) = e;
201
202 return edges_out;
203}