/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;

/**
 * <code>OutputCommitter</code> describes the commit of task output for a
 * Map-Reduce job.
 *
 * <p>The Map-Reduce framework relies on the <code>OutputCommitter</code> of
 * the job to:</p>
 * <ol>
 *   <li>
 *   Setup the job during initialization. For example, create the temporary
 *   output directory for the job during the initialization of the job.
 *   </li>
 *   <li>
 *   Cleanup the job after the job completion. For example, remove the
 *   temporary output directory after the job completion.
 *   </li>
 *   <li>
 *   Setup the task temporary output.
 *   </li>
 *   <li>
 *   Check whether a task needs a commit. This is to avoid the commit
 *   procedure if a task does not need commit.
 *   </li>
 *   <li>
 *   Commit of the task output.
 *   </li>
 *   <li>
 *   Discard the task commit.
 *   </li>
 * </ol>
 * The methods in this class can be called from several different processes and
 * from several different contexts. It is important to know which process and
 * which context each is called from. Each method should be marked accordingly
 * in its documentation.
 *
 * @see org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 * @see JobContext
 * @see TaskAttemptContext
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public abstract class OutputCommitter {
  /**
   * For the framework to setup the job output during initialization. This is
   * called from the application master process for the entire job.
   *
   * @param jobContext Context of the job whose output is being written.
   * @throws IOException if temporary output could not be created
   */
  public abstract void setupJob(JobContext jobContext) throws IOException;

  /**
   * For cleaning up the job's output after job completion. This is called
   * from the application master process for the entire job.
   *
   * @param jobContext Context of the job whose output is being written.
   * @throws IOException if job cleanup fails
   * @deprecated Use {@link #commitJob(JobContext)} and
   *             {@link #abortJob(JobContext, JobStatus.State)} instead.
   */
  @Deprecated
  public void cleanupJob(JobContext jobContext) throws IOException { }

  /**
   * For committing job's output after successful job completion. Note that this
   * is invoked for jobs with final runstate as SUCCESSFUL. This is called
   * from the application master process for the entire job.
   *
   * @param jobContext Context of the job whose output is being written.
   * @throws IOException if the job output could not be committed
   */
  public void commitJob(JobContext jobContext) throws IOException {
    // Delegates to the deprecated cleanupJob so that older committers that
    // only override cleanupJob keep working (backward compatibility).
    cleanupJob(jobContext);
  }


  /**
   * For aborting an unsuccessful job's output. Note that this is invoked for
   * jobs with final runstate as {@link JobStatus.State#FAILED} or
   * {@link JobStatus.State#KILLED}. This is called from the application
   * master process for the entire job.
   *
   * @param jobContext Context of the job whose output is being written.
   * @param state final runstate of the job
   * @throws IOException if the job output could not be aborted
   */
  public void abortJob(JobContext jobContext, JobStatus.State state)
      throws IOException {
    // Same backward-compatibility delegation as commitJob: older committers
    // that only override the deprecated cleanupJob still get invoked.
    cleanupJob(jobContext);
  }

  /**
   * Sets up output for the task. This is called from each individual task's
   * process that will output to HDFS, and it is called just for that task.
   *
   * @param taskContext Context of the task whose output is being written.
   * @throws IOException if task output setup fails
   */
  public abstract void setupTask(TaskAttemptContext taskContext)
      throws IOException;

  /**
   * Check whether task needs a commit. This is called from each individual
   * task's process that will output to HDFS, and it is called just for that
   * task.
   *
   * @param taskContext Context of the task whose output is being written.
   * @return <code>true</code> if the task needs a commit,
   *         <code>false</code> otherwise
   * @throws IOException if the check fails
   */
  public abstract boolean needsTaskCommit(TaskAttemptContext taskContext)
      throws IOException;

  /**
   * To promote the task's temporary output to final output location.
   * If {@link #needsTaskCommit(TaskAttemptContext)} returns true and this
   * task is the task that the AM determines finished first, this method
   * is called to commit an individual task's output. This is to mark
   * that task's output as complete, as {@link #commitJob(JobContext)} will
   * also be called later on if the entire job finished successfully. This
   * is called from a task's process.
   *
   * @param taskContext Context of the task whose output is being written.
   * @throws IOException if commit is not successful.
   */
  public abstract void commitTask(TaskAttemptContext taskContext)
      throws IOException;

  /**
   * Discard the task output. This is called from a task's process to clean
   * up a single task's output that cannot yet be committed.
   *
   * @param taskContext Context of the task whose output is being discarded.
   * @throws IOException if the task output could not be discarded
   */
  public abstract void abortTask(TaskAttemptContext taskContext)
      throws IOException;

  /**
   * Is task output recovery supported for restarting jobs?
   *
   * If task output recovery is supported, job restart can be done more
   * efficiently.
   *
   * @return <code>true</code> if task output recovery is supported,
   *         <code>false</code> otherwise
   * @see #recoverTask(TaskAttemptContext)
   */
  public boolean isRecoverySupported() {
    // Conservative default: committers must opt in to recovery support.
    return false;
  }

  /**
   * Recover the task output.
   *
   * The retry-count for the job will be passed via the
   * {@link MRJobConfig#APPLICATION_ATTEMPT_ID} key in
   * {@link TaskAttemptContext#getConfiguration()} for the
   * <code>OutputCommitter</code>. This is called from the application master
   * process, but it is called individually for each task.
   *
   * If an exception is thrown the task will be attempted again.
   *
   * @param taskContext Context of the task whose output is being recovered
   * @throws IOException if task output recovery fails
   */
  public void recoverTask(TaskAttemptContext taskContext)
      throws IOException
  {}
}