KMR
kmrckpt.h
Go to the documentation of this file.
1 /* kmrckpt.h (2014-04-01) */
2 /* Copyright (C) 2012-2016 RIKEN AICS */
3 
4 #ifndef _KMRCKPT_H
5 #define _KMRCKPT_H
6 
7 /** \file kmrckpt.h Checkpoint/Restart Support.
8  Checkpoint/Restart provides two modes: SAVE_ALL and SAVE_SELECTIVE.
9  In SAVE_ALL mode, checkpoints of all key-value stores are taken.
10  In SAVE_SELECTIVE mode, only checkpoints of user-specified key-value
11  stores are taken. */
12 
13 #define KMR_CKPT_DIRNAME "ckptdir" /* NOTE(review): presumably the default checkpoint directory name -- confirm */
14 #define KMR_CKPT_FNAME_PREFIX "kmrckpt" /* NOTE(review): presumably the prefix of checkpoint file names -- confirm */
15 #define KMR_CKPT_PATHLEN 512 /* buffer size for file names (see fname[]) */
16 #define KMR_CKPT_DIRLEN 256 /* buffer size for directory names (see ckpt_dname[], dname[]) */
17 #define KMR_CKPT_MSGLEN 512 /* NOTE(review): presumably a message buffer length; not used in this header -- confirm */
18 
19 
20 /** Checkpoint modes. KMR_CKPT_ALL takes checkpoints of all key-value
21  stores. KMR_CKPT_SELECTIVE takes checkpoints of key-value stores
22  that the user specified. */
24  KMR_CKPT_ALL,
25  KMR_CKPT_SELECTIVE
26 };
27 
28 /* An item in a list */
30  void *val;
31  struct kmr_ckpt_list_item *next;
32  struct kmr_ckpt_list_item *prev;
33 };
34 
35 /* Maximum size of a list (NOTE(review): presumably an upper bound on the number of stored items -- confirm) */
36 #define KMR_CKPT_LIST_MAX 1000
37 
38 /* Allocator for items in a list: takes a void * and returns a void *. */
39 typedef void * (*kmr_ckpt_list_alocfn_t)(void *);
40 /* Deallocator for items in a list: takes a void * and returns nothing. */
41 typedef void (*kmr_ckpt_list_freefn_t)(void *);
42 /* Comparator for items in a list: takes two void * and returns an int (NOTE(review): result convention not visible here). */
43 typedef int (*kmr_ckpt_list_compfn_t)(void *, void *);
44 
45 /* Linked list of struct kmr_ckpt_list_item, with per-list callbacks for item handling. */
46 struct kmr_ckpt_list {
47  struct kmr_ckpt_list_item *head; /* first item of the list */
48  struct kmr_ckpt_list_item *tail; /* last item of the list */
49  kmr_ckpt_list_alocfn_t alocfn; /* allocator callback for items */
50  kmr_ckpt_list_freefn_t freefn; /* deallocator callback for items */
51  kmr_ckpt_list_compfn_t compfn; /* comparator callback for items */
52  long size; /* size of stored items */
53 };
54 
55 /* A collection that stores kvs transitions. The items in this collection
56  are struct kmr_ckpt_operation. */
58  struct kmr_ckpt_list *chainlst;
59  int chainlst_size;
60 };
61 
62 /* Record of an operation to input and output KVS */
64  long op_seqno;
65  long kvi_id;
66  long kvo_id;
67 };
68 
69 /* Record of the key-value position at which a map/reduce operation starts */
70 struct kv_position {
71  long op_seqno; /* operation sequence number */
72  long start_from; /* start kv position (NOTE(review): index base and units not visible here) */
73 };
74 
75 /* Checkpoint context data: directory name, log/data file pointers, progress counters, lock state and selective-mode settings. */
76 struct kmr_ckpt_ctx {
77  char ckpt_dname[KMR_CKPT_DIRLEN]; /* checkpoint directory name */
78  int prev_mode; /* checkpoint mode of previous run */
79  FILE *ckpt_log_fp; /* ckpt log file pointer */
80  long progress_counter; /* MapReduce progress counter */
81  long prev_progress; /* progress of previous run */
82  long prev_global_progress; /* global progress of previous run */
83  long cur_kvi_id; /* ID of current targeted KVI */
84  long cur_kvo_id; /* ID of current targeted KVO */
85  FILE *ckpt_data_fp; /* file pointer to a ckpt data file
86  that is currently targeted */
87  long saved_element_count; /* count of key-values in a KVS
88  that are already written to a
89  ckpt data file */
90  void *saved_adding_point; /* key-value position in a KVS from
91  which writing to a ckpt data file
92  starts */
93  void *saved_current_block; /* key-values block in a KVS from
94  which writing to a ckpt data file
95  starts */
96  struct kv_position *kv_positions; /* start kv position in map/reduce */
97  int kv_positions_count; /* count of kv_positions */
98  int lock_id; /* ckpt lock id */
99  int lock_counter; /* ckpt lock counter */
100  _Bool initialized; /* 1 if ckpt_ctx is initialized */
101  _Bool slct_cur_take_ckpt; /* [selective mode]
102  if 1, ckpt data of the output KVS
103  generated by current operation will
104  be taken */
105  struct kmr_ckpt_list *slct_skip_ops;
106  /* [selective mode]
107  list of ids of operations that can
108  be skipped when restarted */
109 };
110 
111 /* Checkpoint data file header */
113  char magic[8]; /* magic number */
114  char info[4]; /* misc info (ex. compressed etc.) */
115  int nprocs; /* MPI communicator size*/
116  int rank; /* MPI rank */
117  long kvs_id; /* ID of saved kvs */
118  enum kmr_kv_field key_data;
119  enum kmr_kv_field value_data;
120  struct kmr_kvs_entry data[1];
121 };
122 
123 /* Log entry type. Enumerators take implicit values 0..14 in declaration order, so reordering them changes their values. */
124 enum kmr_ckpt_log_state {
125  KMR_CKPT_LOG_WHOLE_START,
126  KMR_CKPT_LOG_WHOLE_FINISH,
127  KMR_CKPT_LOG_BLOCK_START,
128  KMR_CKPT_LOG_BLOCK_ADD,
129  KMR_CKPT_LOG_BLOCK_FINISH,
130  KMR_CKPT_LOG_INDEX_START,
131  KMR_CKPT_LOG_INDEX_ADD,
132  KMR_CKPT_LOG_INDEX_FINISH,
133  KMR_CKPT_LOG_DELETE_START,
134  KMR_CKPT_LOG_DELETE_FINISH,
135  KMR_CKPT_LOG_DELETABLE,
136  KMR_CKPT_LOG_PROGRESS,
137  KMR_CKPT_LOG_SKIPPED,
138  KMR_CKPT_LOG_LOCK_START,
139  KMR_CKPT_LOG_LOCK_FINISH
140 };
141 
142 /* Checkpoint log entry */
144  long op_seqno; /* operation sequence number */
145  long kvi_id;
146  long kvo_id;
147  int state;
148  long n_kvi; /* number of processed kv in kvi */
149  long n_kvo; /* number of generated kv in kvo */
150 };
151 
152 /* Checkpoint log file header, followed by log entries */
153 struct kmr_ckpt_log {
154  char magic[8]; /* magic number */
155  char info[4]; /* misc info (ex. compressed etc.) */
156  int mode; /* checkpoint mode */
157  int nprocs; /* MPI communicator size */
158  int rank; /* MPI rank */
159  struct kmr_ckpt_log_entry data[1]; /* log entries; declared with one element (NOTE(review): presumably a variable-length tail -- confirm) */
160 };
161 
162 /* Checkpoint file types: a data file or a log file */
163 enum kmr_ckpt_type {
164  KMR_CKPT_DATA, /* implicit value 0 */
165  KMR_CKPT_LOG /* implicit value 1 */
166 };
167 
168 /* Metadata of a checkpoint data file. */
170  long kvs_id;
171  _Bool checked; /* if 1, check of restoring data
172  has been done */
173  _Bool merged; /* if 1, the data file will be
174  merged */
175  char fname[KMR_CKPT_PATHLEN];
176  char dname[KMR_CKPT_DIRLEN];
177 };
178 
179 /* Information about a merged target data */
181  int rank;
182  long n_kvi;
183  long n_kvo;
184  long *done_ikv_lst; /* list of processed input kv */
185  long done_ikv_lst_size; /* size of done_ikv_lst */
186  long kvi_op_seqno; /* op_seqno when used as kvi */
187  struct kmr_ckpt_data_file *file;
188 };
189 
190 /* A merge of checkpoint data from previous ranks */
192  int rank; /* rank that merges data */
193  long kvs_id; /* id of kvs whose data are merged */
194  struct kmr_ckpt_merge_source *src_lst; /* merged data list */
195  int src_lst_count; /* size of src_lst */
196 };
197 
198 /* Checkpoint merge context data */
200  int max_each_merge; /* maximum number of merged data
201  of each merge */
202  struct kmr_ckpt_merge *merges; /* list of merges */
203  int merges_count; /* size of merges */
204 };
205 
206 /* State of a rank in previous run */
208  int prev_rank; /* rank of previous run */
209  int prev_nprocs; /* number of processes used in previous run */
210  char *ckpt_dir; /* checkpoint file directory */
211  struct kmr_ckpt_data_file *dataflst; /* checkpoint data files */
212  int dataflst_size; /* number of dataflst */
213 };
214 
215 /* Dummy kvs id (0) used for a null kvs */
216 #define KMR_CKPT_DUMMY_ID 0
217 
218 /*
219 Copyright (C) 2012-2016 RIKEN AICS
220 This library is distributed WITHOUT ANY WARRANTY. This library can be
221 redistributed and/or modified under the terms of the BSD 2-Clause License.
222 */
223 
224 #endif /* _KMRCKPT_H */
Definition: kmr.h:348
kmr_kv_field
Datatypes of Keys or Values.
Definition: kmr.h:325
kmr_ckpt_mode
Checkpoint modes.
Definition: kmrckpt.h:23
Definition: kmrckpt.h:143